From 3a87562e8d1226bc0e8f2c6748251d056f34b1d3 Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Wed, 12 Mar 2025 21:54:12 -0700
Subject: [PATCH] scoring updates

---
 docs/_static/llama-stack-spec.html            | 1489 ++++++++---------
 docs/_static/llama-stack-spec.yaml            | 1161 +++++++------
 llama_stack/apis/benchmarks/benchmarks.py     |    1 -
 llama_stack/apis/eval/eval.py                 |   48 +-
 llama_stack/apis/scoring/scoring.py           |   15 +-
 .../scoring_functions/scoring_functions.py    |   98 +-
 6 files changed, 1346 insertions(+), 1466 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index d2a745655..493eeebc4 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -968,7 +968,60 @@
                 }
             }
         },
-        "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
+        "/v1/eval/benchmarks/{benchmark_id}/jobs": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "The job that was created to run the evaluation.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/Job"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "Run an evaluation on a benchmark.",
+                "parameters": [
+                    {
+                        "name": "benchmark_id",
+                        "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/EvaluateBenchmarkRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
+        "/v1/eval/rows": {
             "post": {
                 "responses": {
                     "200": {
@@ -997,18 +1050,8 @@
                 "tags": [
                     "Eval"
                 ],
-                "description": "Evaluate a list of rows on a benchmark.",
-                "parameters": [
-                    {
-                        "name": "benchmark_id",
-                        "in": "path",
-                        "description": "The ID of the benchmark to run the evaluation on.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ],
+                "description": "Evaluate a list of rows on a candidate.",
+                "parameters": [],
                 "requestBody": {
                     "content": {
                         "application/json": {
@@ -3498,59 +3541,6 @@
                 }
             }
         },
-        "/v1/eval/benchmarks/{benchmark_id}/jobs": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "The job that was created to run the evaluation.",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/Job"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Eval"
-                ],
-                "description": "Run an evaluation on a benchmark.",
-                "parameters": [
-                    {
-                        "name": "benchmark_id",
-                        "in": "path",
-                        "description": "The ID of the benchmark to run the evaluation on.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/RunEvalRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
         "/v1/safety/run-shield": {
             "post": {
                 "responses": {
@@ -3708,7 +3698,7 @@
                     "content": {
                         "application/json": {
                             "schema": {
-                                "$ref": "#/components/schemas/ScoreBatchRequest"
+                                "$ref": "#/components/schemas/ScoreDatasetRequest"
                             }
                         }
                     },
@@ -6385,381 +6375,6 @@
                 "title": "AgentCandidate",
                 "description": "An agent candidate for evaluation."
             },
-            "AnswerCorrectnessScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "answer_correctness",
-                        "default": "answer_correctness"
-                    },
-                    "answer_correctness": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "BasicScoringFnParamsFields"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "answer_correctness"
-                ],
-                "title": "AnswerCorrectnessScoringFnParams"
-            },
-            "AnswerRelevancyScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "answer_relevancy",
-                        "default": "answer_relevancy"
-                    },
-                    "answer_relevancy": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "BasicScoringFnParamsFields"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "answer_relevancy"
-                ],
-                "title": "AnswerRelevancyScoringFnParams"
-            },
-            "AnswerSimilarityScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "answer_similarity",
-                        "default": "answer_similarity"
-                    },
-                    "answer_similarity": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "BasicScoringFnParamsFields"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "answer_similarity"
-                ],
-                "title": "AnswerSimilarityScoringFnParams"
-            },
-            "BenchmarkConfig": {
-                "type": "object",
-                "properties": {
-                    "eval_candidate": {
-                        "$ref": "#/components/schemas/EvalCandidate",
-                        "description": "The candidate to evaluate."
-                    },
-                    "scoring_params": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringFnParams"
-                        },
-                        "description": "Map between scoring function id and parameters for each scoring function you want to run"
-                    },
-                    "num_examples": {
-                        "type": "integer",
-                        "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "eval_candidate",
-                    "scoring_params"
-                ],
-                "title": "BenchmarkConfig",
-                "description": "A benchmark configuration for evaluation."
-            },
-            "ContextEntityRecallScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "context_entity_recall",
-                        "default": "context_entity_recall"
-                    },
-                    "context_entity_recall": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "BasicScoringFnParamsFields"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "context_entity_recall"
-                ],
-                "title": "ContextEntityRecallScoringFnParams"
-            },
-            "ContextPrecisionScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "context_precision",
-                        "default": "context_precision"
-                    },
-                    "context_precision": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "BasicScoringFnParamsFields"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "context_precision"
-                ],
-                "title": "ContextPrecisionScoringFnParams"
-            },
-            "ContextRecallScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "context_recall",
-                        "default": "context_recall"
-                    },
-                    "context_recall": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "BasicScoringFnParamsFields"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "context_recall"
-                ],
-                "title": "ContextRecallScoringFnParams"
-            },
-            "ContextRelevancyScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "context_relevancy",
-                        "default": "context_relevancy"
-                    },
-                    "context_relevancy": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "BasicScoringFnParamsFields"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "context_relevancy"
-                ],
-                "title": "ContextRelevancyScoringFnParams"
-            },
-            "CustomLLMAsJudgeScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "custom_llm_as_judge",
-                        "default": "custom_llm_as_judge"
-                    },
-                    "custom_llm_as_judge": {
-                        "type": "object",
-                        "properties": {
-                            "type": {
-                                "type": "string",
-                                "const": "custom_llm_as_judge",
-                                "default": "custom_llm_as_judge"
-                            },
-                            "judge_model": {
-                                "type": "string"
-                            },
-                            "prompt_template": {
-                                "type": "string"
-                            },
-                            "judge_score_regexes": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string"
-                                }
-                            }
-                        },
-                        "additionalProperties": false,
-                        "required": [
-                            "type",
-                            "judge_model"
-                        ],
-                        "title": "CustomLLMAsJudgeScoringFnParamsFields"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "custom_llm_as_judge"
-                ],
-                "title": "CustomLLMAsJudgeScoringFnParams"
-            },
-            "EqualityScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "equality",
-                        "default": "equality"
-                    },
-                    "equality": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "BasicScoringFnParamsFields"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "equality"
-                ],
-                "title": "EqualityScoringFnParams"
-            },
             "EvalCandidate": {
                 "oneOf": [
                     {
@@ -6777,82 +6392,6 @@
                     }
                 }
             },
-            "FactualityScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "factuality",
-                        "default": "factuality"
-                    },
-                    "factuality": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "BasicScoringFnParamsFields"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "factuality"
-                ],
-                "title": "FactualityScoringFnParams"
-            },
-            "FaithfulnessScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "faithfulness",
-                        "default": "faithfulness"
-                    },
-                    "faithfulness": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "BasicScoringFnParamsFields"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "faithfulness"
-                ],
-                "title": "FaithfulnessScoringFnParams"
-            },
             "ModelCandidate": {
                 "type": "object",
                 "properties": {
@@ -6883,209 +6422,37 @@
                 "title": "ModelCandidate",
                 "description": "A model candidate for evaluation."
             },
-            "RegexParserMathScoringFnParams": {
+            "EvaluateBenchmarkRequest": {
                 "type": "object",
                 "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "regex_parser_math_response",
-                        "default": "regex_parser_math_response"
-                    },
-                    "regex_parser_math_response": {
-                        "type": "object",
-                        "properties": {
-                            "parsing_regexes": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string"
-                                },
-                                "description": "(Optional) Regexes to extract the answer from generated response."
-                            },
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "required": [
-                            "parsing_regexes"
-                        ],
-                        "title": "RegexParserScoringFnParamsFields"
+                    "candidate": {
+                        "$ref": "#/components/schemas/EvalCandidate",
+                        "description": "The candidate to evaluate."
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "type",
-                    "regex_parser_math_response"
+                    "candidate"
                 ],
-                "title": "RegexParserMathScoringFnParams"
+                "title": "EvaluateBenchmarkRequest"
             },
-            "RegexParserScoringFnParams": {
+            "Job": {
                 "type": "object",
                 "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "regex_parser",
-                        "default": "regex_parser"
-                    },
-                    "regex_parser": {
-                        "type": "object",
-                        "properties": {
-                            "parsing_regexes": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string"
-                                },
-                                "description": "(Optional) Regexes to extract the answer from generated response."
-                            },
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "required": [
-                            "parsing_regexes"
-                        ],
-                        "title": "RegexParserScoringFnParamsFields"
+                    "job_id": {
+                        "type": "string"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "type",
-                    "regex_parser"
+                    "job_id"
                 ],
-                "title": "RegexParserScoringFnParams"
-            },
-            "ScoringFnParams": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/CustomLLMAsJudgeScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/RegexParserScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/RegexParserMathScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/EqualityScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/SubsetOfcoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/FactualityScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/FaithfulnessScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/AnswerCorrectnessScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/AnswerRelevancyScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/AnswerSimilarityScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/ContextEntityRecallScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/ContextPrecisionScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/ContextRecallScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/ContextRelevancyScoringFnParams"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "custom_llm_as_judge": "#/components/schemas/CustomLLMAsJudgeScoringFnParams",
-                        "regex_parser": "#/components/schemas/RegexParserScoringFnParams",
-                        "regex_parser_math_response": "#/components/schemas/RegexParserMathScoringFnParams",
-                        "equality": "#/components/schemas/EqualityScoringFnParams",
-                        "subset_of": "#/components/schemas/SubsetOfcoringFnParams",
-                        "factuality": "#/components/schemas/FactualityScoringFnParams",
-                        "faithfulness": "#/components/schemas/FaithfulnessScoringFnParams",
-                        "answer_correctness": "#/components/schemas/AnswerCorrectnessScoringFnParams",
-                        "answer_relevancy": "#/components/schemas/AnswerRelevancyScoringFnParams",
-                        "answer_similarity": "#/components/schemas/AnswerSimilarityScoringFnParams",
-                        "context_entity_recall": "#/components/schemas/ContextEntityRecallScoringFnParams",
-                        "context_precision": "#/components/schemas/ContextPrecisionScoringFnParams",
-                        "context_recall": "#/components/schemas/ContextRecallScoringFnParams",
-                        "context_relevancy": "#/components/schemas/ContextRelevancyScoringFnParams"
-                    }
-                }
-            },
-            "SubsetOfcoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "subset_of",
-                        "default": "subset_of"
-                    },
-                    "subset_of": {
-                        "type": "object",
-                        "properties": {
-                            "aggregation_functions": {
-                                "type": "array",
-                                "items": {
-                                    "type": "string",
-                                    "enum": [
-                                        "average",
-                                        "median",
-                                        "categorical_count",
-                                        "accuracy"
-                                    ],
-                                    "title": "AggregationFunctionType",
-                                    "description": "A type of aggregation function."
-                                },
-                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "BasicScoringFnParamsFields"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "subset_of"
-                ],
-                "title": "SubsetOfcoringFnParams"
+                "title": "Job"
             },
             "EvaluateRowsRequest": {
                 "type": "object",
                 "properties": {
-                    "input_rows": {
+                    "dataset_rows": {
                         "type": "array",
                         "items": {
                             "type": "object",
@@ -7114,23 +6481,23 @@
                         },
                         "description": "The rows to evaluate."
                     },
-                    "scoring_functions": {
+                    "scoring_fn_ids": {
                         "type": "array",
                         "items": {
                             "type": "string"
                         },
-                        "description": "The scoring functions to use for the evaluation."
+                        "description": "The IDs of the scoring functions to use for the evaluation."
                     },
-                    "benchmark_config": {
-                        "$ref": "#/components/schemas/BenchmarkConfig",
-                        "description": "The configuration for the benchmark."
+                    "candidate": {
+                        "$ref": "#/components/schemas/EvalCandidate",
+                        "description": "The candidate to evaluate."
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "input_rows",
-                    "scoring_functions",
-                    "benchmark_config"
+                    "dataset_rows",
+                    "scoring_fn_ids",
+                    "candidate"
                 ],
                 "title": "EvaluateRowsRequest"
             },
@@ -7731,6 +7098,526 @@
                 "title": "PaginatedRowsResult",
                 "description": "A paginated list of rows from a dataset."
             },
+            "AnswerCorrectnessScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "answer_correctness",
+                        "default": "answer_correctness"
+                    },
+                    "answer_correctness": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "BasicScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "answer_correctness"
+                ],
+                "title": "AnswerCorrectnessScoringFn"
+            },
+            "AnswerRelevancyScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "answer_relevancy",
+                        "default": "answer_relevancy"
+                    },
+                    "answer_relevancy": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "BasicScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "answer_relevancy"
+                ],
+                "title": "AnswerRelevancyScoringFn"
+            },
+            "AnswerSimilarityScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "answer_similarity",
+                        "default": "answer_similarity"
+                    },
+                    "answer_similarity": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "BasicScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "answer_similarity"
+                ],
+                "title": "AnswerSimilarityScoringFn"
+            },
+            "ContextEntityRecallScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "context_entity_recall",
+                        "default": "context_entity_recall"
+                    },
+                    "context_entity_recall": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "BasicScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "context_entity_recall"
+                ],
+                "title": "ContextEntityRecallScoringFn"
+            },
+            "ContextPrecisionScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "context_precision",
+                        "default": "context_precision"
+                    },
+                    "context_precision": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "BasicScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "context_precision"
+                ],
+                "title": "ContextPrecisionScoringFn"
+            },
+            "ContextRecallScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "context_recall",
+                        "default": "context_recall"
+                    },
+                    "context_recall": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "BasicScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "context_recall"
+                ],
+                "title": "ContextRecallScoringFn"
+            },
+            "ContextRelevancyScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "context_relevancy",
+                        "default": "context_relevancy"
+                    },
+                    "context_relevancy": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "BasicScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "context_relevancy"
+                ],
+                "title": "ContextRelevancyScoringFn"
+            },
+            "CustomLLMAsJudgeScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "custom_llm_as_judge",
+                        "default": "custom_llm_as_judge"
+                    },
+                    "custom_llm_as_judge": {
+                        "type": "object",
+                        "properties": {
+                            "type": {
+                                "type": "string",
+                                "const": "custom_llm_as_judge",
+                                "default": "custom_llm_as_judge"
+                            },
+                            "judge_model": {
+                                "type": "string"
+                            },
+                            "prompt_template": {
+                                "type": "string"
+                            },
+                            "judge_score_regexes": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string"
+                                }
+                            }
+                        },
+                        "additionalProperties": false,
+                        "required": [
+                            "type",
+                            "judge_model"
+                        ],
+                        "title": "CustomLLMAsJudgeScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "custom_llm_as_judge"
+                ],
+                "title": "CustomLLMAsJudgeScoringFn"
+            },
+            "EqualityScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "equality",
+                        "default": "equality"
+                    },
+                    "equality": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "BasicScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "equality"
+                ],
+                "title": "EqualityScoringFn"
+            },
+            "FactualityScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "factuality",
+                        "default": "factuality"
+                    },
+                    "factuality": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "BasicScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "factuality"
+                ],
+                "title": "FactualityScoringFn"
+            },
+            "FaithfulnessScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "faithfulness",
+                        "default": "faithfulness"
+                    },
+                    "faithfulness": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "BasicScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "faithfulness"
+                ],
+                "title": "FaithfulnessScoringFn"
+            },
+            "RegexParserMathScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "regex_parser_math_response",
+                        "default": "regex_parser_math_response"
+                    },
+                    "regex_parser_math_response": {
+                        "type": "object",
+                        "properties": {
+                            "parsing_regexes": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string"
+                                },
+                                "description": "(Optional) Regexes to extract the answer from generated response."
+                            },
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "required": [
+                            "parsing_regexes"
+                        ],
+                        "title": "RegexParserScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "regex_parser_math_response"
+                ],
+                "title": "RegexParserMathScoringFn"
+            },
+            "RegexParserScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "regex_parser",
+                        "default": "regex_parser"
+                    },
+                    "regex_parser": {
+                        "type": "object",
+                        "properties": {
+                            "parsing_regexes": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string"
+                                },
+                                "description": "(Optional) Regexes to extract the answer from generated response."
+                            },
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "required": [
+                            "parsing_regexes"
+                        ],
+                        "title": "RegexParserScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "regex_parser"
+                ],
+                "title": "RegexParserScoringFn"
+            },
             "ScoringFn": {
                 "type": "object",
                 "properties": {
@@ -7749,7 +7636,7 @@
                         "default": "scoring_function"
                     },
                     "fn": {
-                        "$ref": "#/components/schemas/ScoringFnParams",
+                        "$ref": "#/components/schemas/ScoringFnDefinition",
                         "description": "The scoring function type and parameters."
                     },
                     "metadata": {
@@ -7790,6 +7677,109 @@
                 ],
                 "title": "ScoringFn"
             },
+            "ScoringFnDefinition": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/CustomLLMAsJudgeScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/RegexParserScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/RegexParserMathScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/EqualityScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/SubsetOfScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/FactualityScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/FaithfulnessScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/AnswerCorrectnessScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/AnswerRelevancyScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/AnswerSimilarityScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/ContextEntityRecallScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/ContextPrecisionScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/ContextRecallScoringFn"
+                    },
+                    {
+                        "$ref": "#/components/schemas/ContextRelevancyScoringFn"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "custom_llm_as_judge": "#/components/schemas/CustomLLMAsJudgeScoringFn",
+                        "regex_parser": "#/components/schemas/RegexParserScoringFn",
+                        "regex_parser_math_response": "#/components/schemas/RegexParserMathScoringFn",
+                        "equality": "#/components/schemas/EqualityScoringFn",
+                        "subset_of": "#/components/schemas/SubsetOfScoringFn",
+                        "factuality": "#/components/schemas/FactualityScoringFn",
+                        "faithfulness": "#/components/schemas/FaithfulnessScoringFn",
+                        "answer_correctness": "#/components/schemas/AnswerCorrectnessScoringFn",
+                        "answer_relevancy": "#/components/schemas/AnswerRelevancyScoringFn",
+                        "answer_similarity": "#/components/schemas/AnswerSimilarityScoringFn",
+                        "context_entity_recall": "#/components/schemas/ContextEntityRecallScoringFn",
+                        "context_precision": "#/components/schemas/ContextPrecisionScoringFn",
+                        "context_recall": "#/components/schemas/ContextRecallScoringFn",
+                        "context_relevancy": "#/components/schemas/ContextRelevancyScoringFn"
+                    }
+                }
+            },
+            "SubsetOfScoringFn": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "subset_of",
+                        "default": "subset_of"
+                    },
+                    "subset_of": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                },
+                                "description": "(Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed."
+                            }
+                        },
+                        "additionalProperties": false,
+                        "title": "BasicScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "subset_of"
+                ],
+                "title": "SubsetOfScoringFn"
+            },
             "Shield": {
                 "type": "object",
                 "properties": {
@@ -9992,7 +9982,7 @@
                 "type": "object",
                 "properties": {
                     "fn": {
-                        "$ref": "#/components/schemas/ScoringFnParams",
+                        "$ref": "#/components/schemas/ScoringFnDefinition",
                         "description": "The type and parameters for the scoring function."
                     },
                     "scoring_fn_id": {
@@ -10168,33 +10158,6 @@
                 ],
                 "title": "ResumeAgentTurnRequest"
             },
-            "RunEvalRequest": {
-                "type": "object",
-                "properties": {
-                    "benchmark_config": {
-                        "$ref": "#/components/schemas/BenchmarkConfig",
-                        "description": "The configuration for the benchmark."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "benchmark_config"
-                ],
-                "title": "RunEvalRequest"
-            },
-            "Job": {
-                "type": "object",
-                "properties": {
-                    "job_id": {
-                        "type": "string"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "job_id"
-                ],
-                "title": "Job"
-            },
             "RunShieldRequest": {
                 "type": "object",
                 "properties": {
@@ -10284,7 +10247,7 @@
             "ScoreRequest": {
                 "type": "object",
                 "properties": {
-                    "input_rows": {
+                    "dataset_rows": {
                         "type": "array",
                         "items": {
                             "type": "object",
@@ -10313,25 +10276,18 @@
                         },
                         "description": "The rows to score."
                     },
-                    "scoring_functions": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "$ref": "#/components/schemas/ScoringFnParams"
-                                },
-                                {
-                                    "type": "null"
-                                }
-                            ]
+                    "scoring_fn_ids": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
                         },
-                        "description": "The scoring functions to use for the scoring."
+                        "description": "The scoring function ids to use for the scoring."
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "input_rows",
-                    "scoring_functions"
+                    "dataset_rows",
+                    "scoring_fn_ids"
                 ],
                 "title": "ScoreRequest"
             },
@@ -10353,36 +10309,25 @@
                 "title": "ScoreResponse",
                 "description": "The response from scoring."
             },
-            "ScoreBatchRequest": {
+            "ScoreDatasetRequest": {
                 "type": "object",
                 "properties": {
                     "dataset_id": {
                         "type": "string"
                     },
-                    "scoring_functions": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "$ref": "#/components/schemas/ScoringFnParams"
-                                },
-                                {
-                                    "type": "null"
-                                }
-                            ]
+                    "scoring_fn_ids": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
                         }
-                    },
-                    "save_results_dataset": {
-                        "type": "boolean"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
                     "dataset_id",
-                    "scoring_functions",
-                    "save_results_dataset"
+                    "scoring_fn_ids"
                 ],
-                "title": "ScoreBatchRequest"
+                "title": "ScoreDatasetRequest"
             },
             "ScoreBatchResponse": {
                 "type": "object",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 5b99ba5aa..310b77eb1 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -666,7 +666,44 @@ paths:
             schema:
               $ref: '#/components/schemas/EmbeddingsRequest'
         required: true
-  /v1/eval/benchmarks/{benchmark_id}/evaluations:
+  /v1/eval/benchmarks/{benchmark_id}/jobs:
+    post:
+      responses:
+        '200':
+          description: >-
+            The job that was created to run the evaluation.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Job'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Eval
+      description: Run an evaluation on a benchmark.
+      parameters:
+        - name: benchmark_id
+          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
+          required: true
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/EvaluateBenchmarkRequest'
+        required: true
+  /v1/eval/rows:
     post:
       responses:
         '200':
@@ -688,15 +725,8 @@ paths:
           $ref: '#/components/responses/DefaultError'
       tags:
         - Eval
-      description: Evaluate a list of rows on a benchmark.
-      parameters:
-        - name: benchmark_id
-          in: path
-          description: >-
-            The ID of the benchmark to run the evaluation on.
-          required: true
-          schema:
-            type: string
+      description: Evaluate a list of rows on a candidate.
+      parameters: []
       requestBody:
         content:
           application/json:
@@ -2377,43 +2407,6 @@ paths:
             schema:
               $ref: '#/components/schemas/ResumeAgentTurnRequest'
         required: true
-  /v1/eval/benchmarks/{benchmark_id}/jobs:
-    post:
-      responses:
-        '200':
-          description: >-
-            The job that was created to run the evaluation.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Eval
-      description: Run an evaluation on a benchmark.
-      parameters:
-        - name: benchmark_id
-          in: path
-          description: >-
-            The ID of the benchmark to run the evaluation on.
-          required: true
-          schema:
-            type: string
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RunEvalRequest'
-        required: true
   /v1/safety/run-shield:
     post:
       responses:
@@ -2525,7 +2518,7 @@ paths:
         content:
           application/json:
             schema:
-              $ref: '#/components/schemas/ScoreBatchRequest'
+              $ref: '#/components/schemas/ScoreDatasetRequest'
         required: true
   /v1/post-training/supervised-fine-tune:
     post:
@@ -4448,311 +4441,6 @@ components:
         - config
       title: AgentCandidate
       description: An agent candidate for evaluation.
-    AnswerCorrectnessScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: answer_correctness
-          default: answer_correctness
-        answer_correctness:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          title: BasicScoringFnParamsFields
-      additionalProperties: false
-      required:
-        - type
-        - answer_correctness
-      title: AnswerCorrectnessScoringFnParams
-    AnswerRelevancyScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: answer_relevancy
-          default: answer_relevancy
-        answer_relevancy:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          title: BasicScoringFnParamsFields
-      additionalProperties: false
-      required:
-        - type
-        - answer_relevancy
-      title: AnswerRelevancyScoringFnParams
-    AnswerSimilarityScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: answer_similarity
-          default: answer_similarity
-        answer_similarity:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          title: BasicScoringFnParamsFields
-      additionalProperties: false
-      required:
-        - type
-        - answer_similarity
-      title: AnswerSimilarityScoringFnParams
-    BenchmarkConfig:
-      type: object
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/EvalCandidate'
-          description: The candidate to evaluate.
-        scoring_params:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringFnParams'
-          description: >-
-            Map between scoring function id and parameters for each scoring function
-            you want to run
-        num_examples:
-          type: integer
-          description: >-
-            (Optional) The number of examples to evaluate. If not provided, all examples
-            in the dataset will be evaluated
-      additionalProperties: false
-      required:
-        - eval_candidate
-        - scoring_params
-      title: BenchmarkConfig
-      description: >-
-        A benchmark configuration for evaluation.
-    ContextEntityRecallScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: context_entity_recall
-          default: context_entity_recall
-        context_entity_recall:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          title: BasicScoringFnParamsFields
-      additionalProperties: false
-      required:
-        - type
-        - context_entity_recall
-      title: ContextEntityRecallScoringFnParams
-    ContextPrecisionScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: context_precision
-          default: context_precision
-        context_precision:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          title: BasicScoringFnParamsFields
-      additionalProperties: false
-      required:
-        - type
-        - context_precision
-      title: ContextPrecisionScoringFnParams
-    ContextRecallScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: context_recall
-          default: context_recall
-        context_recall:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          title: BasicScoringFnParamsFields
-      additionalProperties: false
-      required:
-        - type
-        - context_recall
-      title: ContextRecallScoringFnParams
-    ContextRelevancyScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: context_relevancy
-          default: context_relevancy
-        context_relevancy:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          title: BasicScoringFnParamsFields
-      additionalProperties: false
-      required:
-        - type
-        - context_relevancy
-      title: ContextRelevancyScoringFnParams
-    CustomLLMAsJudgeScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: custom_llm_as_judge
-          default: custom_llm_as_judge
-        custom_llm_as_judge:
-          type: object
-          properties:
-            type:
-              type: string
-              const: custom_llm_as_judge
-              default: custom_llm_as_judge
-            judge_model:
-              type: string
-            prompt_template:
-              type: string
-            judge_score_regexes:
-              type: array
-              items:
-                type: string
-          additionalProperties: false
-          required:
-            - type
-            - judge_model
-          title: CustomLLMAsJudgeScoringFnParamsFields
-      additionalProperties: false
-      required:
-        - type
-        - custom_llm_as_judge
-      title: CustomLLMAsJudgeScoringFnParams
-    EqualityScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: equality
-          default: equality
-        equality:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          title: BasicScoringFnParamsFields
-      additionalProperties: false
-      required:
-        - type
-        - equality
-      title: EqualityScoringFnParams
     EvalCandidate:
       oneOf:
         - $ref: '#/components/schemas/ModelCandidate'
@@ -4762,68 +4450,6 @@ components:
         mapping:
           model: '#/components/schemas/ModelCandidate'
           agent: '#/components/schemas/AgentCandidate'
-    FactualityScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: factuality
-          default: factuality
-        factuality:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          title: BasicScoringFnParamsFields
-      additionalProperties: false
-      required:
-        - type
-        - factuality
-      title: FactualityScoringFnParams
-    FaithfulnessScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: faithfulness
-          default: faithfulness
-        faithfulness:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          title: BasicScoringFnParamsFields
-      additionalProperties: false
-      required:
-        - type
-        - faithfulness
-      title: FaithfulnessScoringFnParams
     ModelCandidate:
       type: object
       properties:
@@ -4849,152 +4475,29 @@ components:
         - sampling_params
       title: ModelCandidate
       description: A model candidate for evaluation.
-    RegexParserMathScoringFnParams:
+    EvaluateBenchmarkRequest:
       type: object
       properties:
-        type:
-          type: string
-          const: regex_parser_math_response
-          default: regex_parser_math_response
-        regex_parser_math_response:
-          type: object
-          properties:
-            parsing_regexes:
-              type: array
-              items:
-                type: string
-              description: >-
-                (Optional) Regexes to extract the answer from generated response.
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          required:
-            - parsing_regexes
-          title: RegexParserScoringFnParamsFields
+        candidate:
+          $ref: '#/components/schemas/EvalCandidate'
+          description: The candidate to evaluate on.
       additionalProperties: false
       required:
-        - type
-        - regex_parser_math_response
-      title: RegexParserMathScoringFnParams
-    RegexParserScoringFnParams:
+        - candidate
+      title: EvaluateBenchmarkRequest
+    Job:
       type: object
       properties:
-        type:
+        job_id:
           type: string
-          const: regex_parser
-          default: regex_parser
-        regex_parser:
-          type: object
-          properties:
-            parsing_regexes:
-              type: array
-              items:
-                type: string
-              description: >-
-                (Optional) Regexes to extract the answer from generated response.
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          required:
-            - parsing_regexes
-          title: RegexParserScoringFnParamsFields
       additionalProperties: false
       required:
-        - type
-        - regex_parser
-      title: RegexParserScoringFnParams
-    ScoringFnParams:
-      oneOf:
-        - $ref: '#/components/schemas/CustomLLMAsJudgeScoringFnParams'
-        - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        - $ref: '#/components/schemas/RegexParserMathScoringFnParams'
-        - $ref: '#/components/schemas/EqualityScoringFnParams'
-        - $ref: '#/components/schemas/SubsetOfcoringFnParams'
-        - $ref: '#/components/schemas/FactualityScoringFnParams'
-        - $ref: '#/components/schemas/FaithfulnessScoringFnParams'
-        - $ref: '#/components/schemas/AnswerCorrectnessScoringFnParams'
-        - $ref: '#/components/schemas/AnswerRelevancyScoringFnParams'
-        - $ref: '#/components/schemas/AnswerSimilarityScoringFnParams'
-        - $ref: '#/components/schemas/ContextEntityRecallScoringFnParams'
-        - $ref: '#/components/schemas/ContextPrecisionScoringFnParams'
-        - $ref: '#/components/schemas/ContextRecallScoringFnParams'
-        - $ref: '#/components/schemas/ContextRelevancyScoringFnParams'
-      discriminator:
-        propertyName: type
-        mapping:
-          custom_llm_as_judge: '#/components/schemas/CustomLLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-          regex_parser_math_response: '#/components/schemas/RegexParserMathScoringFnParams'
-          equality: '#/components/schemas/EqualityScoringFnParams'
-          subset_of: '#/components/schemas/SubsetOfcoringFnParams'
-          factuality: '#/components/schemas/FactualityScoringFnParams'
-          faithfulness: '#/components/schemas/FaithfulnessScoringFnParams'
-          answer_correctness: '#/components/schemas/AnswerCorrectnessScoringFnParams'
-          answer_relevancy: '#/components/schemas/AnswerRelevancyScoringFnParams'
-          answer_similarity: '#/components/schemas/AnswerSimilarityScoringFnParams'
-          context_entity_recall: '#/components/schemas/ContextEntityRecallScoringFnParams'
-          context_precision: '#/components/schemas/ContextPrecisionScoringFnParams'
-          context_recall: '#/components/schemas/ContextRecallScoringFnParams'
-          context_relevancy: '#/components/schemas/ContextRelevancyScoringFnParams'
-    SubsetOfcoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: subset_of
-          default: subset_of
-        subset_of:
-          type: object
-          properties:
-            aggregation_functions:
-              type: array
-              items:
-                type: string
-                enum:
-                  - average
-                  - median
-                  - categorical_count
-                  - accuracy
-                title: AggregationFunctionType
-                description: A type of aggregation function.
-              description: >-
-                (Optional) Aggregation functions to apply to the scores of each row.
-                If not provided, no aggregation will be performed.
-          additionalProperties: false
-          title: BasicScoringFnParamsFields
-      additionalProperties: false
-      required:
-        - type
-        - subset_of
-      title: SubsetOfcoringFnParams
+        - job_id
+      title: Job
     EvaluateRowsRequest:
       type: object
       properties:
-        input_rows:
+        dataset_rows:
           type: array
           items:
             type: object
@@ -5007,20 +4510,20 @@ components:
                 - type: array
                 - type: object
           description: The rows to evaluate.
-        scoring_functions:
+        scoring_fn_ids:
           type: array
           items:
             type: string
           description: >-
-            The scoring functions to use for the evaluation.
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark.
+            The scoring function ids to use for the evaluation.
+        candidate:
+          $ref: '#/components/schemas/EvalCandidate'
+          description: The candidate to evaluate on.
       additionalProperties: false
       required:
-        - input_rows
-        - scoring_functions
-        - benchmark_config
+        - dataset_rows
+        - scoring_fn_ids
+        - candidate
       title: EvaluateRowsRequest
     EvaluateResponse:
       type: object
@@ -5393,6 +4896,426 @@ components:
         - total_count
       title: PaginatedRowsResult
       description: A paginated list of rows from a dataset.
+    AnswerCorrectnessScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: answer_correctness
+          default: answer_correctness
+        answer_correctness:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          title: BasicScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - answer_correctness
+      title: AnswerCorrectnessScoringFn
+    AnswerRelevancyScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: answer_relevancy
+          default: answer_relevancy
+        answer_relevancy:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          title: BasicScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - answer_relevancy
+      title: AnswerRelevancyScoringFn
+    AnswerSimilarityScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: answer_similarity
+          default: answer_similarity
+        answer_similarity:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          title: BasicScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - answer_similarity
+      title: AnswerSimilarityScoringFn
+    ContextEntityRecallScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: context_entity_recall
+          default: context_entity_recall
+        context_entity_recall:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          title: BasicScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - context_entity_recall
+      title: ContextEntityRecallScoringFn
+    ContextPrecisionScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: context_precision
+          default: context_precision
+        context_precision:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          title: BasicScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - context_precision
+      title: ContextPrecisionScoringFn
+    ContextRecallScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: context_recall
+          default: context_recall
+        context_recall:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          title: BasicScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - context_recall
+      title: ContextRecallScoringFn
+    ContextRelevancyScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: context_relevancy
+          default: context_relevancy
+        context_relevancy:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          title: BasicScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - context_relevancy
+      title: ContextRelevancyScoringFn
+    CustomLLMAsJudgeScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: custom_llm_as_judge
+          default: custom_llm_as_judge
+        custom_llm_as_judge:
+          type: object
+          properties:
+            type:
+              type: string
+              const: custom_llm_as_judge
+              default: custom_llm_as_judge
+            judge_model:
+              type: string
+            prompt_template:
+              type: string
+            judge_score_regexes:
+              type: array
+              items:
+                type: string
+          additionalProperties: false
+          required:
+            - type
+            - judge_model
+          title: CustomLLMAsJudgeScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - custom_llm_as_judge
+      title: CustomLLMAsJudgeScoringFn
+    EqualityScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: equality
+          default: equality
+        equality:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          title: BasicScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - equality
+      title: EqualityScoringFn
+    FactualityScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: factuality
+          default: factuality
+        factuality:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          title: BasicScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - factuality
+      title: FactualityScoringFn
+    FaithfulnessScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: faithfulness
+          default: faithfulness
+        faithfulness:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          title: BasicScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - faithfulness
+      title: FaithfulnessScoringFn
+    RegexParserMathScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: regex_parser_math_response
+          default: regex_parser_math_response
+        regex_parser_math_response:
+          type: object
+          properties:
+            parsing_regexes:
+              type: array
+              items:
+                type: string
+              description: >-
+                (Optional) Regexes to extract the answer from generated response.
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          required:
+            - parsing_regexes
+          title: RegexParserScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - regex_parser_math_response
+      title: RegexParserMathScoringFn
+    RegexParserScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: regex_parser
+          default: regex_parser
+        regex_parser:
+          type: object
+          properties:
+            parsing_regexes:
+              type: array
+              items:
+                type: string
+              description: >-
+                (Optional) Regexes to extract the answer from generated response.
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          required:
+            - parsing_regexes
+          title: RegexParserScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - regex_parser
+      title: RegexParserScoringFn
     ScoringFn:
       type: object
       properties:
@@ -5407,7 +5330,7 @@ components:
           const: scoring_function
           default: scoring_function
         fn:
-          $ref: '#/components/schemas/ScoringFnParams'
+          $ref: '#/components/schemas/ScoringFnDefinition'
           description: >-
             The scoring function type and parameters.
         metadata:
@@ -5431,6 +5354,70 @@ components:
         - fn
         - metadata
       title: ScoringFn
+    ScoringFnDefinition:
+      oneOf:
+        - $ref: '#/components/schemas/CustomLLMAsJudgeScoringFn'
+        - $ref: '#/components/schemas/RegexParserScoringFn'
+        - $ref: '#/components/schemas/RegexParserMathScoringFn'
+        - $ref: '#/components/schemas/EqualityScoringFn'
+        - $ref: '#/components/schemas/SubsetOfScoringFn'
+        - $ref: '#/components/schemas/FactualityScoringFn'
+        - $ref: '#/components/schemas/FaithfulnessScoringFn'
+        - $ref: '#/components/schemas/AnswerCorrectnessScoringFn'
+        - $ref: '#/components/schemas/AnswerRelevancyScoringFn'
+        - $ref: '#/components/schemas/AnswerSimilarityScoringFn'
+        - $ref: '#/components/schemas/ContextEntityRecallScoringFn'
+        - $ref: '#/components/schemas/ContextPrecisionScoringFn'
+        - $ref: '#/components/schemas/ContextRecallScoringFn'
+        - $ref: '#/components/schemas/ContextRelevancyScoringFn'
+      discriminator:
+        propertyName: type
+        mapping:
+          custom_llm_as_judge: '#/components/schemas/CustomLLMAsJudgeScoringFn'
+          regex_parser: '#/components/schemas/RegexParserScoringFn'
+          regex_parser_math_response: '#/components/schemas/RegexParserMathScoringFn'
+          equality: '#/components/schemas/EqualityScoringFn'
+          subset_of: '#/components/schemas/SubsetOfScoringFn'
+          factuality: '#/components/schemas/FactualityScoringFn'
+          faithfulness: '#/components/schemas/FaithfulnessScoringFn'
+          answer_correctness: '#/components/schemas/AnswerCorrectnessScoringFn'
+          answer_relevancy: '#/components/schemas/AnswerRelevancyScoringFn'
+          answer_similarity: '#/components/schemas/AnswerSimilarityScoringFn'
+          context_entity_recall: '#/components/schemas/ContextEntityRecallScoringFn'
+          context_precision: '#/components/schemas/ContextPrecisionScoringFn'
+          context_recall: '#/components/schemas/ContextRecallScoringFn'
+          context_relevancy: '#/components/schemas/ContextRelevancyScoringFn'
+    SubsetOfScoringFn:
+      type: object
+      properties:
+        type:
+          type: string
+          const: subset_of
+          default: subset_of
+        subset_of:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+              description: >-
+                (Optional) Aggregation functions to apply to the scores of each row.
+                If not provided, no aggregation will be performed.
+          additionalProperties: false
+          title: BasicScoringFnParams
+      additionalProperties: false
+      required:
+        - type
+        - subset_of
+      title: SubsetOfScoringFn
     Shield:
       type: object
       properties:
@@ -6853,7 +6840,7 @@ components:
       type: object
       properties:
         fn:
-          $ref: '#/components/schemas/ScoringFnParams'
+          $ref: '#/components/schemas/ScoringFnDefinition'
           description: >-
             The type and parameters for the scoring function.
         scoring_fn_id:
@@ -6959,25 +6946,6 @@ components:
       required:
         - tool_responses
       title: ResumeAgentTurnRequest
-    RunEvalRequest:
-      type: object
-      properties:
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark.
-      additionalProperties: false
-      required:
-        - benchmark_config
-      title: RunEvalRequest
-    Job:
-      type: object
-      properties:
-        job_id:
-          type: string
-      additionalProperties: false
-      required:
-        - job_id
-      title: Job
     RunShieldRequest:
       type: object
       properties:
@@ -7034,7 +7002,7 @@ components:
     ScoreRequest:
       type: object
       properties:
-        input_rows:
+        dataset_rows:
           type: array
           items:
             type: object
@@ -7047,18 +7015,16 @@ components:
                 - type: array
                 - type: object
           description: The rows to score.
-        scoring_functions:
-          type: object
-          additionalProperties:
-            oneOf:
-              - $ref: '#/components/schemas/ScoringFnParams'
-              - type: 'null'
+        scoring_fn_ids:
+          type: array
+          items:
+            type: string
           description: >-
-            The scoring functions to use for the scoring.
+            The scoring function ids to use for the scoring.
       additionalProperties: false
       required:
-        - input_rows
-        - scoring_functions
+        - dataset_rows
+        - scoring_fn_ids
       title: ScoreRequest
     ScoreResponse:
       type: object
@@ -7074,25 +7040,20 @@ components:
         - results
       title: ScoreResponse
       description: The response from scoring.
-    ScoreBatchRequest:
+    ScoreDatasetRequest:
       type: object
       properties:
         dataset_id:
           type: string
-        scoring_functions:
-          type: object
-          additionalProperties:
-            oneOf:
-              - $ref: '#/components/schemas/ScoringFnParams'
-              - type: 'null'
-        save_results_dataset:
-          type: boolean
+        scoring_fn_ids:
+          type: array
+          items:
+            type: string
       additionalProperties: false
       required:
         - dataset_id
-        - scoring_functions
-        - save_results_dataset
-      title: ScoreBatchRequest
+        - scoring_fn_ids
+      title: ScoreDatasetRequest
     ScoreBatchResponse:
       type: object
       properties:
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 01fc873e6..45edd3d6b 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -8,7 +8,6 @@ from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkab
 from pydantic import BaseModel, Field
 
 from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.scoring_functions import ScoringFnParams
 from llama_stack.schema_utils import json_schema_type, webmethod
 
 
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index dec018d83..552afe0a2 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -13,7 +13,6 @@ from llama_stack.apis.agents import AgentConfig
 from llama_stack.apis.common.job_types import Job, JobStatus
 from llama_stack.apis.inference import SamplingParams, SystemMessage
 from llama_stack.apis.scoring import ScoringResult
-from llama_stack.apis.scoring_functions import ScoringFnParams
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 
 
@@ -49,27 +48,6 @@ EvalCandidate = register_schema(
 )
 
 
-@json_schema_type
-class BenchmarkConfig(BaseModel):
-    """A benchmark configuration for evaluation.
-
-    :param eval_candidate: The candidate to evaluate.
-    :param scoring_params: Map between scoring function id and parameters for each scoring function you want to run
-    :param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated
-    """
-
-    eval_candidate: EvalCandidate
-    scoring_params: Dict[str, ScoringFnParams] = Field(
-        description="Map between scoring function id and parameters for each scoring function you want to run",
-        default_factory=dict,
-    )
-    num_examples: Optional[int] = Field(
-        description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
-        default=None,
-    )
-    # we could optinally add any specific dataset config here
-
-
 @json_schema_type
 class EvaluateResponse(BaseModel):
     """The response from an evaluation.
@@ -87,32 +65,30 @@ class Eval(Protocol):
     """Llama Stack Evaluation API for running evaluations on model and agent candidates."""
 
     @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
-    async def run_eval(
+    async def evaluate_benchmark(
         self,
         benchmark_id: str,
-        benchmark_config: BenchmarkConfig,
+        candidate: EvalCandidate,
     ) -> Job:
         """Run an evaluation on a benchmark.
 
         :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param benchmark_config: The configuration for the benchmark.
+        :param candidate: The candidate to evaluate on.
         :return: The job that was created to run the evaluation.
         """
 
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
+    @webmethod(route="/eval/rows", method="POST")
     async def evaluate_rows(
         self,
-        benchmark_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
-        benchmark_config: BenchmarkConfig,
+        dataset_rows: List[Dict[str, Any]],
+        scoring_fn_ids: List[str],
+        candidate: EvalCandidate,
     ) -> EvaluateResponse:
-        """Evaluate a list of rows on a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param input_rows: The rows to evaluate.
-        :param scoring_functions: The scoring functions to use for the evaluation.
-        :param benchmark_config: The configuration for the benchmark.
+        """Evaluate a list of rows on a candidate.
+
+        :param dataset_rows: The rows to evaluate.
+        :param scoring_fn_ids: The scoring function ids to use for the evaluation.
+        :param candidate: The candidate to evaluate on.
         :return: EvaluateResponse object containing generations and scores
         """
 
diff --git a/llama_stack/apis/scoring/scoring.py b/llama_stack/apis/scoring/scoring.py
index 54a9ac2aa..eecca7799 100644
--- a/llama_stack/apis/scoring/scoring.py
+++ b/llama_stack/apis/scoring/scoring.py
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
 
 from pydantic import BaseModel
 
-from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams
+from llama_stack.apis.scoring_functions import ScoringFn
 from llama_stack.schema_utils import json_schema_type, webmethod
 
 # mapping of metric to value
@@ -56,23 +56,22 @@ class Scoring(Protocol):
     scoring_function_store: ScoringFunctionStore
 
     @webmethod(route="/scoring/score-batch", method="POST")
-    async def score_batch(
+    async def score_dataset(
         self,
         dataset_id: str,
-        scoring_functions: Dict[str, Optional[ScoringFnParams]],
-        save_results_dataset: bool = False,
+        scoring_fn_ids: List[str],
     ) -> ScoreBatchResponse: ...
 
     @webmethod(route="/scoring/score", method="POST")
     async def score(
         self,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: Dict[str, Optional[ScoringFnParams]],
+        dataset_rows: List[Dict[str, Any]],
+        scoring_fn_ids: List[str],
     ) -> ScoreResponse:
         """Score a list of rows.
 
-        :param input_rows: The rows to score.
-        :param scoring_functions: The scoring functions to use for the scoring.
+        :param dataset_rows: The rows to score.
+        :param scoring_fn_ids: The IDs of the scoring functions to use for the scoring.
         :return: ScoreResponse object containing rows and aggregated results
         """
         ...
diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py
index d6ee4f975..0e7ec4354 100644
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@@ -67,7 +67,7 @@ class AggregationFunctionType(Enum):
     accuracy = "accuracy"
 
 
-class BasicScoringFnParamsFields(BaseModel):
+class BasicScoringFnParams(BaseModel):
     """
     :param aggregation_functions: (Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed.
     """
@@ -78,7 +78,7 @@ class BasicScoringFnParamsFields(BaseModel):
     )
 
 
-class RegexParserScoringFnParamsFields(BaseModel):
+class RegexParserScoringFnParams(BaseModel):
     """
     :param parsing_regexes: (Optional) Regexes to extract the answer from generated response.
     :param aggregation_functions: (Optional) Aggregation functions to apply to the scores of each row. If not provided, no aggregation will be performed.
@@ -93,7 +93,7 @@ class RegexParserScoringFnParamsFields(BaseModel):
         default_factory=list,
     )
 
-class CustomLLMAsJudgeScoringFnParamsFields(BaseModel):
+class CustomLLMAsJudgeScoringFnParams(BaseModel):
     type: Literal["custom_llm_as_judge"] = "custom_llm_as_judge"
     judge_model: str
     prompt_template: Optional[str] = None
@@ -103,103 +103,103 @@ class CustomLLMAsJudgeScoringFnParamsFields(BaseModel):
     )
 
 @json_schema_type
-class RegexParserScoringFnParams(BaseModel):
+class RegexParserScoringFn(BaseModel):
     type: Literal["regex_parser"] = "regex_parser"
-    regex_parser: RegexParserScoringFnParamsFields
+    regex_parser: RegexParserScoringFnParams
 
 
 @json_schema_type
-class RegexParserMathScoringFnParams(BaseModel):
+class RegexParserMathScoringFn(BaseModel):
     type: Literal["regex_parser_math_response"] = "regex_parser_math_response"
-    regex_parser_math_response: RegexParserScoringFnParamsFields
+    regex_parser_math_response: RegexParserScoringFnParams
 
 @json_schema_type
-class EqualityScoringFnParams(BaseModel):
+class EqualityScoringFn(BaseModel):
     type: Literal["equality"] = "equality"
-    equality: BasicScoringFnParamsFields
+    equality: BasicScoringFnParams
 
 @json_schema_type
-class SubsetOfcoringFnParams(BaseModel):
+class SubsetOfScoringFn(BaseModel):
     type: Literal["subset_of"] = "subset_of"
-    subset_of: BasicScoringFnParamsFields
+    subset_of: BasicScoringFnParams
 
 @json_schema_type
-class FactualityScoringFnParams(BaseModel):
+class FactualityScoringFn(BaseModel):
     type: Literal["factuality"] = "factuality"
-    factuality: BasicScoringFnParamsFields
+    factuality: BasicScoringFnParams
 
 @json_schema_type
-class FaithfulnessScoringFnParams(BaseModel):
+class FaithfulnessScoringFn(BaseModel):
     type: Literal["faithfulness"] = "faithfulness"
-    faithfulness: BasicScoringFnParamsFields
+    faithfulness: BasicScoringFnParams
 
 @json_schema_type
-class AnswerCorrectnessScoringFnParams(BaseModel):
+class AnswerCorrectnessScoringFn(BaseModel):
     type: Literal["answer_correctness"] = "answer_correctness"
-    answer_correctness: BasicScoringFnParamsFields
+    answer_correctness: BasicScoringFnParams
 
 @json_schema_type
-class AnswerRelevancyScoringFnParams(BaseModel):
+class AnswerRelevancyScoringFn(BaseModel):
     type: Literal["answer_relevancy"] = "answer_relevancy"
-    answer_relevancy: BasicScoringFnParamsFields
+    answer_relevancy: BasicScoringFnParams
 
 @json_schema_type
-class AnswerSimilarityScoringFnParams(BaseModel):
+class AnswerSimilarityScoringFn(BaseModel):
     type: Literal["answer_similarity"] = "answer_similarity"
-    answer_similarity: BasicScoringFnParamsFields
+    answer_similarity: BasicScoringFnParams
 
 
 @json_schema_type
-class ContextEntityRecallScoringFnParams(BaseModel):
+class ContextEntityRecallScoringFn(BaseModel):
     type: Literal["context_entity_recall"] = "context_entity_recall"
-    context_entity_recall: BasicScoringFnParamsFields
+    context_entity_recall: BasicScoringFnParams
 
 
 @json_schema_type
-class ContextPrecisionScoringFnParams(BaseModel):
+class ContextPrecisionScoringFn(BaseModel):
     type: Literal["context_precision"] = "context_precision"
-    context_precision: BasicScoringFnParamsFields
+    context_precision: BasicScoringFnParams
 
 
 @json_schema_type
-class ContextRecallScoringFnParams(BaseModel):
+class ContextRecallScoringFn(BaseModel):
     type: Literal["context_recall"] = "context_recall"
-    context_recall: BasicScoringFnParamsFields
+    context_recall: BasicScoringFnParams
 
 
 @json_schema_type
-class ContextRelevancyScoringFnParams(BaseModel):
+class ContextRelevancyScoringFn(BaseModel):
     type: Literal["context_relevancy"] = "context_relevancy"
-    context_relevancy: BasicScoringFnParamsFields
+    context_relevancy: BasicScoringFnParams
 
 
 @json_schema_type
-class CustomLLMAsJudgeScoringFnParams(BaseModel):
+class CustomLLMAsJudgeScoringFn(BaseModel):
     type: Literal["custom_llm_as_judge"] = "custom_llm_as_judge"
-    custom_llm_as_judge: CustomLLMAsJudgeScoringFnParamsFields
+    custom_llm_as_judge: CustomLLMAsJudgeScoringFnParams
 
 
-ScoringFnParams = register_schema(
+ScoringFnDefinition = register_schema(
     Annotated[
         Union[
-            CustomLLMAsJudgeScoringFnParams,
-            RegexParserScoringFnParams,
-            RegexParserMathScoringFnParams,
-            EqualityScoringFnParams,
-            SubsetOfcoringFnParams,
-            FactualityScoringFnParams,
-            FaithfulnessScoringFnParams,
-            AnswerCorrectnessScoringFnParams,
-            AnswerRelevancyScoringFnParams,
-            AnswerSimilarityScoringFnParams,
-            ContextEntityRecallScoringFnParams,
-            ContextPrecisionScoringFnParams,
-            ContextRecallScoringFnParams,
-            ContextRelevancyScoringFnParams,
+            CustomLLMAsJudgeScoringFn,
+            RegexParserScoringFn,
+            RegexParserMathScoringFn,
+            EqualityScoringFn,
+            SubsetOfScoringFn,
+            FactualityScoringFn,
+            FaithfulnessScoringFn,
+            AnswerCorrectnessScoringFn,
+            AnswerRelevancyScoringFn,
+            AnswerSimilarityScoringFn,
+            ContextEntityRecallScoringFn,
+            ContextPrecisionScoringFn,
+            ContextRecallScoringFn,
+            ContextRelevancyScoringFn,
         ],
         Field(discriminator="type"),
     ],
-    name="ScoringFnParams",
+    name="ScoringFnDefinition",
 )
 
 
@@ -208,7 +208,7 @@ class CommonScoringFnFields(BaseModel):
     :param fn: The scoring function type and parameters. 
     :param metadata: (Optional) Any additional metadata for this definition (e.g. description).
     """
-    fn: ScoringFnParams
+    fn: ScoringFnDefinition
     metadata: Dict[str, Any] = Field(
         default_factory=dict,
         description="Any additional metadata for this definition (e.g. description)",
@@ -288,7 +288,7 @@ class ScoringFunctions(Protocol):
     @webmethod(route="/scoring-functions", method="POST")
     async def register_scoring_function(
         self,
-        fn: ScoringFnParams,
+        fn: ScoringFnDefinition,
         scoring_fn_id: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
     ) -> ScoringFn: