From a69759613a0f024b54dbe97229d3de7cac5109c5 Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Tue, 18 Mar 2025 15:01:41 -0700
Subject: [PATCH] comments

---
 docs/_static/llama-stack-spec.html        | 1670 ++++++++++++++++++---
 docs/_static/llama-stack-spec.yaml        | 1134 ++++++++++++--
 llama_stack/apis/benchmarks/benchmarks.py |    9 +-
 llama_stack/apis/common/job_types.py      |    7 +-
 llama_stack/apis/evaluation/evaluation.py |   14 +-
 llama_stack/apis/graders/graders.py       |   11 +-
 llama_stack/distribution/stack.py         |   30 +-
 7 files changed, 2486 insertions(+), 389 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 8de7f86de..cb5959e22 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -866,83 +866,6 @@
                 ]
             }
         },
-        "/v1/graders/{grader_id}": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "The grader.",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/Grader"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Graders"
-                ],
-                "description": "Get a grader by ID.",
-                "parameters": [
-                    {
-                        "name": "grader_id",
-                        "in": "path",
-                        "description": "The ID of the grader.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            },
-            "delete": {
-                "responses": {
-                    "200": {
-                        "description": "OK"
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Graders"
-                ],
-                "description": "Delete a grader by ID.",
-                "parameters": [
-                    {
-                        "name": "grader_id",
-                        "in": "path",
-                        "description": "The ID of the grader.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            }
-        },
         "/v1/inference/embeddings": {
             "post": {
                 "responses": {
@@ -986,6 +909,59 @@
                 }
             }
         },
+        "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "EvaluateResponse object containing generations and scores",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluateResponse"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "Evaluate a list of rows on a benchmark.",
+                "parameters": [
+                    {
+                        "name": "benchmark_id",
+                        "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/EvaluateRowsRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
         "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": {
             "get": {
                 "responses": {
@@ -1158,6 +1134,39 @@
                         }
                     }
                 ]
+            },
+            "delete": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Benchmarks"
+                ],
+                "description": "Unregister a benchmark by ID.",
+                "parameters": [
+                    {
+                        "name": "benchmark_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
             }
         },
         "/v1/datasets/{dataset_id}": {
@@ -1235,6 +1244,83 @@
                 ]
             }
         },
+        "/v1/graders/{grader_id}": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "The grader.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/Grader"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Graders"
+                ],
+                "description": "Get a grader by ID.",
+                "parameters": [
+                    {
+                        "name": "grader_id",
+                        "in": "path",
+                        "description": "The ID of the grader.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
+            },
+            "delete": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Graders"
+                ],
+                "description": "Unregister a grader by ID.",
+                "parameters": [
+                    {
+                        "name": "grader_id",
+                        "in": "path",
+                        "description": "The ID of the grader.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
+            }
+        },
         "/v1/models/{model_id}": {
             "get": {
                 "responses": {
@@ -1310,6 +1396,48 @@
                 ]
             }
         },
+        "/v1/scoring-functions/{scoring_fn_id}": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/ScoringFn"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "ScoringFunctions"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "scoring_fn_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
+            }
+        },
         "/v1/shields/{identifier}": {
             "get": {
                 "responses": {
@@ -2244,6 +2372,153 @@
                 ]
             }
         },
+        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "The status of the evaluation job.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/JobStatus"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "Get the status of a job.",
+                "parameters": [
+                    {
+                        "name": "benchmark_id",
+                        "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    },
+                    {
+                        "name": "job_id",
+                        "in": "path",
+                        "description": "The ID of the job to get the status of.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
+            },
+            "delete": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "Cancel a job.",
+                "parameters": [
+                    {
+                        "name": "benchmark_id",
+                        "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    },
+                    {
+                        "name": "job_id",
+                        "in": "path",
+                        "description": "The ID of the job to cancel.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
+            }
+        },
+        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "The result of the job.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluateResponse"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "Get the result of a job.",
+                "parameters": [
+                    {
+                        "name": "benchmark_id",
+                        "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    },
+                    {
+                        "name": "job_id",
+                        "in": "path",
+                        "description": "The ID of the job to get the result of.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
+            }
+        },
         "/v1/agents/{agent_id}/sessions": {
             "get": {
                 "responses": {
@@ -2517,9 +2792,9 @@
                     "200": {
                         "description": "A list of graders.",
                         "content": {
-                            "application/jsonl": {
+                            "application/json": {
                                 "schema": {
-                                    "$ref": "#/components/schemas/Grader"
+                                    "$ref": "#/components/schemas/ListGradersResponse"
                                 }
                             }
                         }
@@ -2775,6 +3050,73 @@
                 ]
             }
         },
+        "/v1/scoring-functions": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/ListScoringFunctionsResponse"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "ScoringFunctions"
+                ],
+                "description": "",
+                "parameters": []
+            },
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "ScoringFunctions"
+                ],
+                "description": "",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/RegisterScoringFunctionRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
         "/v1/shields": {
             "get": {
                 "responses": {
@@ -3402,6 +3744,59 @@
                 }
             }
         },
+        "/v1/eval/benchmarks/{benchmark_id}/jobs": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "The job that was created to run the evaluation.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/Job"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "Run an evaluation on a benchmark.",
+                "parameters": [
+                    {
+                        "name": "benchmark_id",
+                        "in": "path",
+                        "description": "The ID of the benchmark to run the evaluation on.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/RunEvalRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
         "/v1/safety/run-shield": {
             "post": {
                 "responses": {
@@ -3524,6 +3919,92 @@
                 }
             }
         },
+        "/v1/scoring/score": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "ScoreResponse object containing rows and aggregated results",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/ScoreResponse"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Scoring"
+                ],
+                "description": "Score a list of rows.",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/ScoreRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
+        "/v1/scoring/score-batch": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/ScoreBatchResponse"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Scoring"
+                ],
+                "description": "",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/ScoreBatchRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
         "/v1/post-training/supervised-fine-tune": {
             "post": {
                 "responses": {
@@ -6149,6 +6630,381 @@
                 "title": "EmbeddingsResponse",
                 "description": "Response containing generated embeddings."
             },
+            "AgentCandidate": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "agent",
+                        "default": "agent"
+                    },
+                    "config": {
+                        "$ref": "#/components/schemas/AgentConfig",
+                        "description": "The configuration for the agent candidate."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "config"
+                ],
+                "title": "AgentCandidate",
+                "description": "An agent candidate for evaluation."
+            },
+            "AggregationFunctionType": {
+                "type": "string",
+                "enum": [
+                    "average",
+                    "median",
+                    "categorical_count",
+                    "accuracy"
+                ],
+                "title": "AggregationFunctionType"
+            },
+            "BasicScoringFnParams": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "basic",
+                        "default": "basic"
+                    },
+                    "aggregation_functions": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/AggregationFunctionType"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "BasicScoringFnParams"
+            },
+            "BenchmarkConfig": {
+                "type": "object",
+                "properties": {
+                    "eval_candidate": {
+                        "$ref": "#/components/schemas/EvalCandidate",
+                        "description": "The candidate to evaluate."
+                    },
+                    "scoring_params": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ScoringFnParams"
+                        },
+                        "description": "Map between scoring function id and parameters for each scoring function you want to run"
+                    },
+                    "num_examples": {
+                        "type": "integer",
+                        "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "eval_candidate",
+                    "scoring_params"
+                ],
+                "title": "BenchmarkConfig",
+                "description": "A benchmark configuration for evaluation."
+            },
+            "EvalCandidate": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/ModelCandidate"
+                    },
+                    {
+                        "$ref": "#/components/schemas/AgentCandidate"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "model": "#/components/schemas/ModelCandidate",
+                        "agent": "#/components/schemas/AgentCandidate"
+                    }
+                }
+            },
+            "LLMAsJudgeScoringFnParams": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "llm_as_judge",
+                        "default": "llm_as_judge"
+                    },
+                    "judge_model": {
+                        "type": "string"
+                    },
+                    "prompt_template": {
+                        "type": "string"
+                    },
+                    "judge_score_regexes": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    },
+                    "aggregation_functions": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/AggregationFunctionType"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "judge_model"
+                ],
+                "title": "LLMAsJudgeScoringFnParams"
+            },
+            "ModelCandidate": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "model",
+                        "default": "model"
+                    },
+                    "model": {
+                        "type": "string",
+                        "description": "The model ID to evaluate."
+                    },
+                    "sampling_params": {
+                        "$ref": "#/components/schemas/SamplingParams",
+                        "description": "The sampling parameters for the model."
+                    },
+                    "system_message": {
+                        "$ref": "#/components/schemas/SystemMessage",
+                        "description": "(Optional) The system message providing instructions or context to the model."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "model",
+                    "sampling_params"
+                ],
+                "title": "ModelCandidate",
+                "description": "A model candidate for evaluation."
+            },
+            "RegexParserScoringFnParams": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "regex_parser",
+                        "default": "regex_parser"
+                    },
+                    "parsing_regexes": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    },
+                    "aggregation_functions": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/AggregationFunctionType"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "RegexParserScoringFnParams"
+            },
+            "ScoringFnParams": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams"
+                    },
+                    {
+                        "$ref": "#/components/schemas/RegexParserScoringFnParams"
+                    },
+                    {
+                        "$ref": "#/components/schemas/BasicScoringFnParams"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams",
+                        "regex_parser": "#/components/schemas/RegexParserScoringFnParams",
+                        "basic": "#/components/schemas/BasicScoringFnParams"
+                    }
+                }
+            },
+            "EvaluateRowsRequest": {
+                "type": "object",
+                "properties": {
+                    "input_rows": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
+                        },
+                        "description": "The rows to evaluate."
+                    },
+                    "scoring_functions": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        },
+                        "description": "The scoring functions to use for the evaluation."
+                    },
+                    "benchmark_config": {
+                        "$ref": "#/components/schemas/BenchmarkConfig",
+                        "description": "The configuration for the benchmark."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "input_rows",
+                    "scoring_functions",
+                    "benchmark_config"
+                ],
+                "title": "EvaluateRowsRequest"
+            },
+            "EvaluateResponse": {
+                "type": "object",
+                "properties": {
+                    "generations": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
+                        },
+                        "description": "The generations from the evaluation."
+                    },
+                    "scores": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ScoringResult"
+                        },
+                        "description": "The scores from the evaluation."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "generations",
+                    "scores"
+                ],
+                "title": "EvaluateResponse",
+                "description": "The response from an evaluation."
+            },
+            "ScoringResult": {
+                "type": "object",
+                "properties": {
+                    "score_rows": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
+                        },
+                        "description": "The scoring result for each row. Each row is a map of column name to value."
+                    },
+                    "aggregated_results": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        },
+                        "description": "Map of metric name to aggregated value"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "score_rows",
+                    "aggregated_results"
+                ],
+                "title": "ScoringResult",
+                "description": "The scoring results for a set of rows: the per-row scores plus the metrics aggregated over those rows."
+            },
             "Agent": {
                 "type": "object",
                 "properties": {
@@ -6876,6 +7732,268 @@
                 ],
                 "title": "ModelType"
             },
+            "AgentTurnInputType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "agent_turn_input",
+                        "default": "agent_turn_input"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "AgentTurnInputType"
+            },
+            "ArrayType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "array",
+                        "default": "array"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "ArrayType"
+            },
+            "BooleanType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "boolean",
+                        "default": "boolean"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "BooleanType"
+            },
+            "ChatCompletionInputType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "chat_completion_input",
+                        "default": "chat_completion_input"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "ChatCompletionInputType"
+            },
+            "CompletionInputType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "completion_input",
+                        "default": "completion_input"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "CompletionInputType"
+            },
+            "JsonType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "json",
+                        "default": "json"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "JsonType"
+            },
+            "NumberType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "number",
+                        "default": "number"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "NumberType"
+            },
+            "ObjectType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "object",
+                        "default": "object"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "ObjectType"
+            },
+            "ParamType": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/StringType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/NumberType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/BooleanType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/ArrayType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/ObjectType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/JsonType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/UnionType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/ChatCompletionInputType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/CompletionInputType"
+                    },
+                    {
+                        "$ref": "#/components/schemas/AgentTurnInputType"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "string": "#/components/schemas/StringType",
+                        "number": "#/components/schemas/NumberType",
+                        "boolean": "#/components/schemas/BooleanType",
+                        "array": "#/components/schemas/ArrayType",
+                        "object": "#/components/schemas/ObjectType",
+                        "json": "#/components/schemas/JsonType",
+                        "union": "#/components/schemas/UnionType",
+                        "chat_completion_input": "#/components/schemas/ChatCompletionInputType",
+                        "completion_input": "#/components/schemas/CompletionInputType",
+                        "agent_turn_input": "#/components/schemas/AgentTurnInputType"
+                    }
+                }
+            },
+            "ScoringFn": {
+                "type": "object",
+                "properties": {
+                    "identifier": {
+                        "type": "string"
+                    },
+                    "provider_resource_id": {
+                        "type": "string"
+                    },
+                    "provider_id": {
+                        "type": "string"
+                    },
+                    "type": {
+                        "type": "string",
+                        "const": "scoring_function",
+                        "default": "scoring_function"
+                    },
+                    "description": {
+                        "type": "string"
+                    },
+                    "metadata": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    },
+                    "return_type": {
+                        "$ref": "#/components/schemas/ParamType"
+                    },
+                    "params": {
+                        "$ref": "#/components/schemas/ScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "identifier",
+                    "provider_resource_id",
+                    "provider_id",
+                    "type",
+                    "metadata",
+                    "return_type"
+                ],
+                "title": "ScoringFn"
+            },
+            "StringType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "string",
+                        "default": "string"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "StringType"
+            },
+            "UnionType": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "union",
+                        "default": "union"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "UnionType"
+            },
             "Shield": {
                 "type": "object",
                 "properties": {
@@ -7405,13 +8523,13 @@
                 ],
                 "title": "VectorDB"
             },
-            "BenchmarkTask": {
+            "BenchmarkEvaluationTask": {
                 "type": "object",
                 "properties": {
                     "type": {
                         "type": "string",
-                        "const": "benchmark_id",
-                        "default": "benchmark_id"
+                        "const": "benchmark",
+                        "default": "benchmark"
                     },
                     "benchmark_id": {
                         "type": "string"
@@ -7422,15 +8540,15 @@
                     "type",
                     "benchmark_id"
                 ],
-                "title": "BenchmarkTask"
+                "title": "BenchmarkEvaluationTask"
             },
-            "DataSourceGraderTask": {
+            "DataEvaluationTask": {
                 "type": "object",
                 "properties": {
                     "type": {
                         "type": "string",
-                        "const": "data_source_grader",
-                        "default": "data_source_grader"
+                        "const": "data",
+                        "default": "data"
                     },
                     "data_source": {
                         "$ref": "#/components/schemas/DataSource"
@@ -7448,15 +8566,15 @@
                     "data_source",
                     "grader_ids"
                 ],
-                "title": "DataSourceGraderTask"
+                "title": "DataEvaluationTask"
             },
-            "DatasetGraderTask": {
+            "DatasetEvaluationTask": {
                 "type": "object",
                 "properties": {
                     "type": {
                         "type": "string",
-                        "const": "dataset_grader",
-                        "default": "dataset_grader"
+                        "const": "dataset",
+                        "default": "dataset"
                     },
                     "dataset_id": {
                         "type": "string"
@@ -7474,26 +8592,26 @@
                     "dataset_id",
                     "grader_ids"
                 ],
-                "title": "DatasetGraderTask"
+                "title": "DatasetEvaluationTask"
             },
             "EvaluationTask": {
                 "oneOf": [
                     {
-                        "$ref": "#/components/schemas/BenchmarkTask"
+                        "$ref": "#/components/schemas/BenchmarkEvaluationTask"
                     },
                     {
-                        "$ref": "#/components/schemas/DatasetGraderTask"
+                        "$ref": "#/components/schemas/DatasetEvaluationTask"
                     },
                     {
-                        "$ref": "#/components/schemas/DataSourceGraderTask"
+                        "$ref": "#/components/schemas/DataEvaluationTask"
                     }
                 ],
                 "discriminator": {
                     "propertyName": "type",
                     "mapping": {
-                        "benchmark_id": "#/components/schemas/BenchmarkTask",
-                        "dataset_grader": "#/components/schemas/DatasetGraderTask",
-                        "data_source_grader": "#/components/schemas/DataSourceGraderTask"
+                        "benchmark": "#/components/schemas/BenchmarkEvaluationTask",
+                        "dataset": "#/components/schemas/DatasetEvaluationTask",
+                        "data": "#/components/schemas/DataEvaluationTask"
                     }
                 }
             },
@@ -7511,27 +8629,6 @@
                 ],
                 "title": "GradeRequest"
             },
-            "AgentCandidate": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "agent",
-                        "default": "agent"
-                    },
-                    "config": {
-                        "$ref": "#/components/schemas/AgentConfig",
-                        "description": "The configuration for the agent candidate."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "config"
-                ],
-                "title": "AgentCandidate",
-                "description": "An agent candidate for evaluation."
-            },
             "EvaluationCandidate": {
                 "oneOf": [
                     {
@@ -7572,10 +8669,10 @@
                         "format": "date-time",
                         "description": "The time the job was created."
                     },
-                    "ended_at": {
+                    "completed_at": {
                         "type": "string",
                         "format": "date-time",
-                        "description": "The time the job ended."
+                        "description": "The time the job completed."
                     },
                     "error": {
                         "type": "string",
@@ -7604,35 +8701,6 @@
                 ],
                 "title": "EvaluationJob"
             },
-            "ModelCandidate": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "model",
-                        "default": "model"
-                    },
-                    "model_id": {
-                        "type": "string"
-                    },
-                    "sampling_params": {
-                        "$ref": "#/components/schemas/SamplingParams",
-                        "description": "The sampling parameters for the model."
-                    },
-                    "system_message": {
-                        "$ref": "#/components/schemas/SystemMessage",
-                        "description": "(Optional) The system message providing instructions or context to the model."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "model_id",
-                    "sampling_params"
-                ],
-                "title": "ModelCandidate",
-                "description": "A model candidate for evaluation."
-            },
             "GradeSyncRequest": {
                 "type": "object",
                 "properties": {
@@ -7695,73 +8763,6 @@
                 "title": "EvaluationResponse",
                 "description": "A response to an inline evaluation."
             },
-            "ScoringResult": {
-                "type": "object",
-                "properties": {
-                    "scores": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
-                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
-                        },
-                        "description": "The scoring result for each row. Each row is a map of grader column name to value."
-                    },
-                    "metrics": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        },
-                        "description": "Map of metric name to aggregated value."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "scores",
-                    "metrics"
-                ],
-                "title": "ScoringResult",
-                "description": "A scoring result for a single row."
-            },
             "HealthInfo": {
                 "type": "object",
                 "properties": {
@@ -8094,6 +9095,17 @@
                 "title": "IterrowsResponse",
                 "description": "A paginated list of rows from a dataset."
             },
+            "JobStatus": {
+                "type": "string",
+                "enum": [
+                    "completed",
+                    "in_progress",
+                    "failed",
+                    "scheduled",
+                    "cancelled"
+                ],
+                "title": "JobStatus"
+            },
             "ListAgentSessionsResponse": {
                 "type": "object",
                 "properties": {
@@ -8266,6 +9278,22 @@
                 ],
                 "title": "ListGraderTypesResponse"
             },
+            "ListGradersResponse": {
+                "type": "object",
+                "properties": {
+                    "data": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/Grader"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "data"
+                ],
+                "title": "ListGradersResponse"
+            },
             "ListModelsResponse": {
                 "type": "object",
                 "properties": {
@@ -8338,6 +9366,22 @@
                 ],
                 "title": "ListRoutesResponse"
             },
+            "ListScoringFunctionsResponse": {
+                "type": "object",
+                "properties": {
+                    "data": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/ScoringFn"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "data"
+                ],
+                "title": "ListScoringFunctionsResponse"
+            },
             "ListShieldsResponse": {
                 "type": "object",
                 "properties": {
@@ -9330,7 +10374,7 @@
                 "properties": {
                     "dataset_id": {
                         "type": "string",
-                        "description": "The ID of the dataset to used to run the benchmark."
+                        "description": "The ID of the dataset to be used to run the benchmark."
                     },
                     "grader_ids": {
                         "type": "array",
@@ -9522,6 +10566,36 @@
                 ],
                 "title": "RegisterModelRequest"
             },
+            "RegisterScoringFunctionRequest": {
+                "type": "object",
+                "properties": {
+                    "scoring_fn_id": {
+                        "type": "string"
+                    },
+                    "description": {
+                        "type": "string"
+                    },
+                    "return_type": {
+                        "$ref": "#/components/schemas/ParamType"
+                    },
+                    "provider_scoring_fn_id": {
+                        "type": "string"
+                    },
+                    "provider_id": {
+                        "type": "string"
+                    },
+                    "params": {
+                        "$ref": "#/components/schemas/ScoringFnParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "scoring_fn_id",
+                    "description",
+                    "return_type"
+                ],
+                "title": "RegisterScoringFunctionRequest"
+            },
             "RegisterShieldRequest": {
                 "type": "object",
                 "properties": {
@@ -9677,6 +10751,33 @@
                 ],
                 "title": "RunRequest"
             },
+            "RunEvalRequest": {
+                "type": "object",
+                "properties": {
+                    "benchmark_config": {
+                        "$ref": "#/components/schemas/BenchmarkConfig",
+                        "description": "The configuration for the benchmark."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "benchmark_config"
+                ],
+                "title": "RunEvalRequest"
+            },
+            "Job": {
+                "type": "object",
+                "properties": {
+                    "job_id": {
+                        "type": "string"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "job_id"
+                ],
+                "title": "Job"
+            },
             "RunShieldRequest": {
                 "type": "object",
                 "properties": {
@@ -9782,6 +10883,128 @@
                 ],
                 "title": "SaveSpansToDatasetRequest"
             },
+            "ScoreRequest": {
+                "type": "object",
+                "properties": {
+                    "input_rows": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
+                        },
+                        "description": "The rows to score."
+                    },
+                    "scoring_functions": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "$ref": "#/components/schemas/ScoringFnParams"
+                                },
+                                {
+                                    "type": "null"
+                                }
+                            ]
+                        },
+                        "description": "The scoring functions to use for the scoring."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "input_rows",
+                    "scoring_functions"
+                ],
+                "title": "ScoreRequest"
+            },
+            "ScoreResponse": {
+                "type": "object",
+                "properties": {
+                    "results": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ScoringResult"
+                        },
+                        "description": "A map of scoring function name to ScoringResult."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "results"
+                ],
+                "title": "ScoreResponse",
+                "description": "The response from scoring."
+            },
+            "ScoreBatchRequest": {
+                "type": "object",
+                "properties": {
+                    "dataset_id": {
+                        "type": "string"
+                    },
+                    "scoring_functions": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "$ref": "#/components/schemas/ScoringFnParams"
+                                },
+                                {
+                                    "type": "null"
+                                }
+                            ]
+                        }
+                    },
+                    "save_results_dataset": {
+                        "type": "boolean"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "dataset_id",
+                    "scoring_functions",
+                    "save_results_dataset"
+                ],
+                "title": "ScoreBatchRequest"
+            },
+            "ScoreBatchResponse": {
+                "type": "object",
+                "properties": {
+                    "dataset_id": {
+                        "type": "string"
+                    },
+                    "results": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ScoringResult"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "results"
+                ],
+                "title": "ScoreBatchResponse"
+            },
             "AlgorithmConfig": {
                 "oneOf": [
                     {
@@ -10144,6 +11367,10 @@
         {
             "name": "Datasets"
         },
+        {
+            "name": "Eval",
+            "x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates."
+        },
         {
             "name": "Evaluation"
         },
@@ -10174,6 +11401,12 @@
         {
             "name": "Safety"
         },
+        {
+            "name": "Scoring"
+        },
+        {
+            "name": "ScoringFunctions"
+        },
         {
             "name": "Shields"
         },
@@ -10205,6 +11438,7 @@
                 "Benchmarks",
                 "DatasetIO",
                 "Datasets",
+                "Eval",
                 "Evaluation",
                 "Files",
                 "Graders",
@@ -10214,6 +11448,8 @@
                 "PostTraining (Coming Soon)",
                 "Providers",
                 "Safety",
+                "Scoring",
+                "ScoringFunctions",
                 "Shields",
                 "SyntheticDataGeneration (Coming Soon)",
                 "Telemetry",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index b0db86389..ecc8104e1 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -589,59 +589,6 @@ paths:
           required: true
           schema:
             type: string
-  /v1/graders/{grader_id}:
-    get:
-      responses:
-        '200':
-          description: The grader.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Grader'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Graders
-      description: Get a grader by ID.
-      parameters:
-        - name: grader_id
-          in: path
-          description: The ID of the grader.
-          required: true
-          schema:
-            type: string
-    delete:
-      responses:
-        '200':
-          description: OK
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Graders
-      description: Delete a grader by ID.
-      parameters:
-        - name: grader_id
-          in: path
-          description: The ID of the grader.
-          required: true
-          schema:
-            type: string
   /v1/inference/embeddings:
     post:
       responses:
@@ -675,6 +622,43 @@ paths:
             schema:
               $ref: '#/components/schemas/EmbeddingsRequest'
         required: true
+  /v1/eval/benchmarks/{benchmark_id}/evaluations:
+    post:
+      responses:
+        '200':
+          description: >-
+            EvaluateResponse object containing generations and scores
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EvaluateResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Eval
+      description: Evaluate a list of rows on a benchmark.
+      parameters:
+        - name: benchmark_id
+          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
+          required: true
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/EvaluateRowsRequest'
+        required: true
   /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}:
     get:
       responses:
@@ -794,6 +778,29 @@ paths:
           required: true
           schema:
             type: string
+    delete:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Benchmarks
+      description: Unregister a benchmark by ID.
+      parameters:
+        - name: benchmark_id
+          in: path
+          required: true
+          schema:
+            type: string
   /v1/datasets/{dataset_id}:
     get:
       responses:
@@ -845,6 +852,59 @@ paths:
           required: true
           schema:
             type: string
+  /v1/graders/{grader_id}:
+    get:
+      responses:
+        '200':
+          description: The grader.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Grader'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Graders
+      description: Get a grader by ID.
+      parameters:
+        - name: grader_id
+          in: path
+          description: The ID of the grader.
+          required: true
+          schema:
+            type: string
+    delete:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Graders
+      description: Unregister a grader by ID.
+      parameters:
+        - name: grader_id
+          in: path
+          description: The ID of the grader.
+          required: true
+          schema:
+            type: string
   /v1/models/{model_id}:
     get:
       responses:
@@ -896,6 +956,34 @@ paths:
           required: true
           schema:
             type: string
+  /v1/scoring-functions/{scoring_fn_id}:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ScoringFn'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - ScoringFunctions
+      description: ''
+      parameters:
+        - name: scoring_fn_id
+          in: path
+          required: true
+          schema:
+            type: string
   /v1/shields/{identifier}:
     get:
       responses:
@@ -1536,6 +1624,109 @@ paths:
           required: false
           schema:
             type: integer
+  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
+    get:
+      responses:
+        '200':
+          description: The status of the evaluation job.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/JobStatus'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Eval
+      description: Get the status of a job.
+      parameters:
+        - name: benchmark_id
+          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
+          required: true
+          schema:
+            type: string
+        - name: job_id
+          in: path
+          description: The ID of the job to get the status of.
+          required: true
+          schema:
+            type: string
+    delete:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Eval
+      description: Cancel a job.
+      parameters:
+        - name: benchmark_id
+          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
+          required: true
+          schema:
+            type: string
+        - name: job_id
+          in: path
+          description: The ID of the job to cancel.
+          required: true
+          schema:
+            type: string
+  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
+    get:
+      responses:
+        '200':
+          description: The result of the job.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EvaluateResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Eval
+      description: Get the result of a job.
+      parameters:
+        - name: benchmark_id
+          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
+          required: true
+          schema:
+            type: string
+        - name: job_id
+          in: path
+          description: The ID of the job to get the result of.
+          required: true
+          schema:
+            type: string
   /v1/agents/{agent_id}/sessions:
     get:
       responses:
@@ -1727,9 +1918,9 @@ paths:
         '200':
           description: A list of graders.
           content:
-            application/jsonl:
+            application/json:
               schema:
-                $ref: '#/components/schemas/Grader'
+                $ref: '#/components/schemas/ListGradersResponse'
         '400':
           $ref: '#/components/responses/BadRequest400'
         '429':
@@ -1902,6 +2093,53 @@ paths:
           required: false
           schema:
             $ref: '#/components/schemas/URL'
+  /v1/scoring-functions:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListScoringFunctionsResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - ScoringFunctions
+      description: ''
+      parameters: []
+    post:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - ScoringFunctions
+      description: ''
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RegisterScoringFunctionRequest'
+        required: true
   /v1/shields:
     get:
       responses:
@@ -2336,6 +2574,43 @@ paths:
             schema:
               $ref: '#/components/schemas/RunRequest'
         required: true
+  /v1/eval/benchmarks/{benchmark_id}/jobs:
+    post:
+      responses:
+        '200':
+          description: >-
+            The job that was created to run the evaluation.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Job'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Eval
+      description: Run an evaluation on a benchmark.
+      parameters:
+        - name: benchmark_id
+          in: path
+          description: >-
+            The ID of the benchmark to run the evaluation on.
+          required: true
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RunEvalRequest'
+        required: true
   /v1/safety/run-shield:
     post:
       responses:
@@ -2419,6 +2694,65 @@ paths:
             schema:
               $ref: '#/components/schemas/SaveSpansToDatasetRequest'
         required: true
+  /v1/scoring/score:
+    post:
+      responses:
+        '200':
+          description: >-
+            ScoreResponse object containing rows and aggregated results
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ScoreResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Scoring
+      description: Score a list of rows.
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/ScoreRequest'
+        required: true
+  /v1/scoring/score-batch:
+    post:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ScoreBatchResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Scoring
+      description: ''
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/ScoreBatchRequest'
+        required: true
   /v1/post-training/supervised-fine-tune:
     post:
       responses:
@@ -4308,6 +4642,251 @@ components:
       title: EmbeddingsResponse
       description: >-
         Response containing generated embeddings.
+    AgentCandidate:
+      type: object
+      properties:
+        type:
+          type: string
+          const: agent
+          default: agent
+        config:
+          $ref: '#/components/schemas/AgentConfig'
+          description: >-
+            The configuration for the agent candidate.
+      additionalProperties: false
+      required:
+        - type
+        - config
+      title: AgentCandidate
+      description: An agent candidate for evaluation.
+    AggregationFunctionType:
+      type: string
+      enum:
+        - average
+        - median
+        - categorical_count
+        - accuracy
+      title: AggregationFunctionType
+    BasicScoringFnParams:
+      type: object
+      properties:
+        type:
+          type: string
+          const: basic
+          default: basic
+        aggregation_functions:
+          type: array
+          items:
+            $ref: '#/components/schemas/AggregationFunctionType'
+      additionalProperties: false
+      required:
+        - type
+      title: BasicScoringFnParams
+    BenchmarkConfig:
+      type: object
+      properties:
+        eval_candidate:
+          $ref: '#/components/schemas/EvalCandidate'
+          description: The candidate to evaluate.
+        scoring_params:
+          type: object
+          additionalProperties:
+            $ref: '#/components/schemas/ScoringFnParams'
+          description: >-
+            Map between scoring function id and parameters for each scoring function
+            you want to run
+        num_examples:
+          type: integer
+          description: >-
+            (Optional) The number of examples to evaluate. If not provided, all examples
+            in the dataset will be evaluated
+      additionalProperties: false
+      required:
+        - eval_candidate
+        - scoring_params
+      title: BenchmarkConfig
+      description: >-
+        A benchmark configuration for evaluation.
+    EvalCandidate:
+      oneOf:
+        - $ref: '#/components/schemas/ModelCandidate'
+        - $ref: '#/components/schemas/AgentCandidate'
+      discriminator:
+        propertyName: type
+        mapping:
+          model: '#/components/schemas/ModelCandidate'
+          agent: '#/components/schemas/AgentCandidate'
+    LLMAsJudgeScoringFnParams:
+      type: object
+      properties:
+        type:
+          type: string
+          const: llm_as_judge
+          default: llm_as_judge
+        judge_model:
+          type: string
+        prompt_template:
+          type: string
+        judge_score_regexes:
+          type: array
+          items:
+            type: string
+        aggregation_functions:
+          type: array
+          items:
+            $ref: '#/components/schemas/AggregationFunctionType'
+      additionalProperties: false
+      required:
+        - type
+        - judge_model
+      title: LLMAsJudgeScoringFnParams
+    ModelCandidate:
+      type: object
+      properties:
+        type:
+          type: string
+          const: model
+          default: model
+        model:
+          type: string
+          description: The model ID to evaluate.
+        sampling_params:
+          $ref: '#/components/schemas/SamplingParams'
+          description: The sampling parameters for the model.
+        system_message:
+          $ref: '#/components/schemas/SystemMessage'
+          description: >-
+            (Optional) The system message providing instructions or context to the
+            model.
+      additionalProperties: false
+      required:
+        - type
+        - model
+        - sampling_params
+      title: ModelCandidate
+      description: A model candidate for evaluation.
+    RegexParserScoringFnParams:
+      type: object
+      properties:
+        type:
+          type: string
+          const: regex_parser
+          default: regex_parser
+        parsing_regexes:
+          type: array
+          items:
+            type: string
+        aggregation_functions:
+          type: array
+          items:
+            $ref: '#/components/schemas/AggregationFunctionType'
+      additionalProperties: false
+      required:
+        - type
+      title: RegexParserScoringFnParams
+    ScoringFnParams:
+      oneOf:
+        - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
+        - $ref: '#/components/schemas/RegexParserScoringFnParams'
+        - $ref: '#/components/schemas/BasicScoringFnParams'
+      discriminator:
+        propertyName: type
+        mapping:
+          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
+          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
+          basic: '#/components/schemas/BasicScoringFnParams'
+    EvaluateRowsRequest:
+      type: object
+      properties:
+        input_rows:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+          description: The rows to evaluate.
+        scoring_functions:
+          type: array
+          items:
+            type: string
+          description: >-
+            The scoring functions to use for the evaluation.
+        benchmark_config:
+          $ref: '#/components/schemas/BenchmarkConfig'
+          description: The configuration for the benchmark.
+      additionalProperties: false
+      required:
+        - input_rows
+        - scoring_functions
+        - benchmark_config
+      title: EvaluateRowsRequest
+    EvaluateResponse:
+      type: object
+      properties:
+        generations:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+          description: The generations from the evaluation.
+        scores:
+          type: object
+          additionalProperties:
+            $ref: '#/components/schemas/ScoringResult'
+          description: The scores from the evaluation.
+      additionalProperties: false
+      required:
+        - generations
+        - scores
+      title: EvaluateResponse
+      description: The response from an evaluation.
+    ScoringResult:
+      type: object
+      properties:
+        score_rows:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+          description: >-
+            The scoring result for each row. Each row is a map of column name to value.
+        aggregated_results:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+          description: Map of metric name to aggregated value
+      additionalProperties: false
+      required:
+        - score_rows
+        - aggregated_results
+      title: ScoringResult
+      description: A scoring result for a single row.
     Agent:
       type: object
       properties:
@@ -4803,6 +5382,179 @@ components:
         - llm
         - embedding
       title: ModelType
+    AgentTurnInputType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: agent_turn_input
+          default: agent_turn_input
+      additionalProperties: false
+      required:
+        - type
+      title: AgentTurnInputType
+    ArrayType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: array
+          default: array
+      additionalProperties: false
+      required:
+        - type
+      title: ArrayType
+    BooleanType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: boolean
+          default: boolean
+      additionalProperties: false
+      required:
+        - type
+      title: BooleanType
+    ChatCompletionInputType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: chat_completion_input
+          default: chat_completion_input
+      additionalProperties: false
+      required:
+        - type
+      title: ChatCompletionInputType
+    CompletionInputType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: completion_input
+          default: completion_input
+      additionalProperties: false
+      required:
+        - type
+      title: CompletionInputType
+    JsonType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: json
+          default: json
+      additionalProperties: false
+      required:
+        - type
+      title: JsonType
+    NumberType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: number
+          default: number
+      additionalProperties: false
+      required:
+        - type
+      title: NumberType
+    ObjectType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: object
+          default: object
+      additionalProperties: false
+      required:
+        - type
+      title: ObjectType
+    ParamType:
+      oneOf:
+        - $ref: '#/components/schemas/StringType'
+        - $ref: '#/components/schemas/NumberType'
+        - $ref: '#/components/schemas/BooleanType'
+        - $ref: '#/components/schemas/ArrayType'
+        - $ref: '#/components/schemas/ObjectType'
+        - $ref: '#/components/schemas/JsonType'
+        - $ref: '#/components/schemas/UnionType'
+        - $ref: '#/components/schemas/ChatCompletionInputType'
+        - $ref: '#/components/schemas/CompletionInputType'
+        - $ref: '#/components/schemas/AgentTurnInputType'
+      discriminator:
+        propertyName: type
+        mapping:
+          string: '#/components/schemas/StringType'
+          number: '#/components/schemas/NumberType'
+          boolean: '#/components/schemas/BooleanType'
+          array: '#/components/schemas/ArrayType'
+          object: '#/components/schemas/ObjectType'
+          json: '#/components/schemas/JsonType'
+          union: '#/components/schemas/UnionType'
+          chat_completion_input: '#/components/schemas/ChatCompletionInputType'
+          completion_input: '#/components/schemas/CompletionInputType'
+          agent_turn_input: '#/components/schemas/AgentTurnInputType'
+    ScoringFn:
+      type: object
+      properties:
+        identifier:
+          type: string
+        provider_resource_id:
+          type: string
+        provider_id:
+          type: string
+        type:
+          type: string
+          const: scoring_function
+          default: scoring_function
+        description:
+          type: string
+        metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+        return_type:
+          $ref: '#/components/schemas/ParamType'
+        params:
+          $ref: '#/components/schemas/ScoringFnParams'
+      additionalProperties: false
+      required:
+        - identifier
+        - provider_resource_id
+        - provider_id
+        - type
+        - metadata
+        - return_type
+      title: ScoringFn
+    StringType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: string
+          default: string
+      additionalProperties: false
+      required:
+        - type
+      title: StringType
+    UnionType:
+      type: object
+      properties:
+        type:
+          type: string
+          const: union
+          default: union
+      additionalProperties: false
+      required:
+        - type
+      title: UnionType
     Shield:
       type: object
       properties:
@@ -5141,27 +5893,27 @@ components:
         - embedding_model
         - embedding_dimension
       title: VectorDB
-    BenchmarkTask:
+    BenchmarkEvaluationTask:
       type: object
       properties:
         type:
           type: string
-          const: benchmark_id
-          default: benchmark_id
+          const: benchmark
+          default: benchmark
         benchmark_id:
           type: string
       additionalProperties: false
       required:
         - type
         - benchmark_id
-      title: BenchmarkTask
-    DataSourceGraderTask:
+      title: BenchmarkEvaluationTask
+    DataEvaluationTask:
       type: object
       properties:
         type:
           type: string
-          const: data_source_grader
-          default: data_source_grader
+          const: data
+          default: data
         data_source:
           $ref: '#/components/schemas/DataSource'
         grader_ids:
@@ -5173,14 +5925,14 @@ components:
         - type
         - data_source
         - grader_ids
-      title: DataSourceGraderTask
-    DatasetGraderTask:
+      title: DataEvaluationTask
+    DatasetEvaluationTask:
       type: object
       properties:
         type:
           type: string
-          const: dataset_grader
-          default: dataset_grader
+          const: dataset
+          default: dataset
         dataset_id:
           type: string
         grader_ids:
@@ -5192,18 +5944,18 @@ components:
         - type
         - dataset_id
         - grader_ids
-      title: DatasetGraderTask
+      title: DatasetEvaluationTask
     EvaluationTask:
       oneOf:
-        - $ref: '#/components/schemas/BenchmarkTask'
-        - $ref: '#/components/schemas/DatasetGraderTask'
-        - $ref: '#/components/schemas/DataSourceGraderTask'
+        - $ref: '#/components/schemas/BenchmarkEvaluationTask'
+        - $ref: '#/components/schemas/DatasetEvaluationTask'
+        - $ref: '#/components/schemas/DataEvaluationTask'
       discriminator:
         propertyName: type
         mapping:
-          benchmark_id: '#/components/schemas/BenchmarkTask'
-          dataset_grader: '#/components/schemas/DatasetGraderTask'
-          data_source_grader: '#/components/schemas/DataSourceGraderTask'
+          benchmark: '#/components/schemas/BenchmarkEvaluationTask'
+          dataset: '#/components/schemas/DatasetEvaluationTask'
+          data: '#/components/schemas/DataEvaluationTask'
     GradeRequest:
       type: object
       properties:
@@ -5218,23 +5970,6 @@ components:
       required:
         - task
       title: GradeRequest
-    AgentCandidate:
-      type: object
-      properties:
-        type:
-          type: string
-          const: agent
-          default: agent
-        config:
-          $ref: '#/components/schemas/AgentConfig'
-          description: >-
-            The configuration for the agent candidate.
-      additionalProperties: false
-      required:
-        - type
-        - config
-      title: AgentCandidate
-      description: An agent candidate for evaluation.
     EvaluationCandidate:
       oneOf:
         - $ref: '#/components/schemas/ModelCandidate'
@@ -5263,10 +5998,10 @@ components:
           type: string
           format: date-time
           description: The time the job was created.
-        ended_at:
+        completed_at:
           type: string
           format: date-time
-          description: The time the job ended.
+          description: The time the job completed.
         error:
           type: string
           description: >-
@@ -5288,30 +6023,6 @@ components:
         - task
         - candidate
       title: EvaluationJob
-    ModelCandidate:
-      type: object
-      properties:
-        type:
-          type: string
-          const: model
-          default: model
-        model_id:
-          type: string
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model.
-        system_message:
-          $ref: '#/components/schemas/SystemMessage'
-          description: >-
-            (Optional) The system message providing instructions or context to the
-            model.
-      additionalProperties: false
-      required:
-        - type
-        - model_id
-        - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
     GradeSyncRequest:
       type: object
       properties:
@@ -5355,41 +6066,6 @@ components:
         - scores
       title: EvaluationResponse
       description: A response to an inline evaluation.
-    ScoringResult:
-      type: object
-      properties:
-        scores:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: >-
-            The scoring result for each row. Each row is a map of grader column name
-            to value.
-        metrics:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: Map of metric name to aggregated value.
-      additionalProperties: false
-      required:
-        - scores
-        - metrics
-      title: ScoringResult
-      description: A scoring result for a single row.
     HealthInfo:
       type: object
       properties:
@@ -5576,6 +6252,15 @@ components:
         - data
       title: IterrowsResponse
       description: A paginated list of rows from a dataset.
+    JobStatus:
+      type: string
+      enum:
+        - completed
+        - in_progress
+        - failed
+        - scheduled
+        - cancelled
+      title: JobStatus
     ListAgentSessionsResponse:
       type: object
       properties:
@@ -5707,6 +6392,17 @@ components:
       required:
         - data
       title: ListGraderTypesResponse
+    ListGradersResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/Grader'
+      additionalProperties: false
+      required:
+        - data
+      title: ListGradersResponse
     ListModelsResponse:
       type: object
       properties:
@@ -5757,6 +6453,17 @@ components:
       required:
         - data
       title: ListRoutesResponse
+    ListScoringFunctionsResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/ScoringFn'
+      additionalProperties: false
+      required:
+        - data
+      title: ListScoringFunctionsResponse
     ListShieldsResponse:
       type: object
       properties:
@@ -6394,7 +7101,7 @@ components:
         dataset_id:
           type: string
           description: >-
-            The ID of the dataset to used to run the benchmark.
+            The ID of the dataset to be used to run the benchmark.
         grader_ids:
           type: array
           items:
@@ -6532,6 +7239,27 @@ components:
       required:
         - model_id
       title: RegisterModelRequest
+    RegisterScoringFunctionRequest:
+      type: object
+      properties:
+        scoring_fn_id:
+          type: string
+        description:
+          type: string
+        return_type:
+          $ref: '#/components/schemas/ParamType'
+        provider_scoring_fn_id:
+          type: string
+        provider_id:
+          type: string
+        params:
+          $ref: '#/components/schemas/ScoringFnParams'
+      additionalProperties: false
+      required:
+        - scoring_fn_id
+        - description
+        - return_type
+      title: RegisterScoringFunctionRequest
     RegisterShieldRequest:
       type: object
       properties:
@@ -6631,6 +7359,25 @@ components:
         - task
         - candidate
       title: RunRequest
+    RunEvalRequest:
+      type: object
+      properties:
+        benchmark_config:
+          $ref: '#/components/schemas/BenchmarkConfig'
+          description: The configuration for the benchmark.
+      additionalProperties: false
+      required:
+        - benchmark_config
+      title: RunEvalRequest
+    Job:
+      type: object
+      properties:
+        job_id:
+          type: string
+      additionalProperties: false
+      required:
+        - job_id
+      title: Job
     RunShieldRequest:
       type: object
       properties:
@@ -6702,6 +7449,81 @@ components:
         - attributes_to_save
         - dataset_id
       title: SaveSpansToDatasetRequest
+    ScoreRequest:
+      type: object
+      properties:
+        input_rows:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+          description: The rows to score.
+        scoring_functions:
+          type: object
+          additionalProperties:
+            oneOf:
+              - $ref: '#/components/schemas/ScoringFnParams'
+              - type: 'null'
+          description: >-
+            The scoring functions to use for the scoring.
+      additionalProperties: false
+      required:
+        - input_rows
+        - scoring_functions
+      title: ScoreRequest
+    ScoreResponse:
+      type: object
+      properties:
+        results:
+          type: object
+          additionalProperties:
+            $ref: '#/components/schemas/ScoringResult'
+          description: >-
+            A map of scoring function name to ScoringResult.
+      additionalProperties: false
+      required:
+        - results
+      title: ScoreResponse
+      description: The response from scoring.
+    ScoreBatchRequest:
+      type: object
+      properties:
+        dataset_id:
+          type: string
+        scoring_functions:
+          type: object
+          additionalProperties:
+            oneOf:
+              - $ref: '#/components/schemas/ScoringFnParams'
+              - type: 'null'
+        save_results_dataset:
+          type: boolean
+      additionalProperties: false
+      required:
+        - dataset_id
+        - scoring_functions
+        - save_results_dataset
+      title: ScoreBatchRequest
+    ScoreBatchResponse:
+      type: object
+      properties:
+        dataset_id:
+          type: string
+        results:
+          type: object
+          additionalProperties:
+            $ref: '#/components/schemas/ScoringResult'
+      additionalProperties: false
+      required:
+        - results
+      title: ScoreBatchResponse
     AlgorithmConfig:
       oneOf:
         - $ref: '#/components/schemas/LoraFinetuningConfig'
@@ -6939,6 +7761,9 @@ tags:
   - name: Benchmarks
   - name: DatasetIO
   - name: Datasets
+  - name: Eval
+    x-displayName: >-
+      Llama Stack Evaluation API for running evaluations on model and agent candidates.
   - name: Evaluation
   - name: Files
   - name: Graders
@@ -6961,6 +7786,8 @@ tags:
     x-displayName: >-
       Providers API for inspecting, listing, and modifying providers and their configurations.
   - name: Safety
+  - name: Scoring
+  - name: ScoringFunctions
   - name: Shields
   - name: SyntheticDataGeneration (Coming Soon)
   - name: Telemetry
@@ -6976,6 +7803,7 @@ x-tagGroups:
       - Benchmarks
       - DatasetIO
       - Datasets
+      - Eval
       - Evaluation
       - Files
       - Graders
@@ -6985,6 +7813,8 @@ x-tagGroups:
       - PostTraining (Coming Soon)
       - Providers
       - Safety
+      - Scoring
+      - ScoringFunctions
       - Shields
       - SyntheticDataGeneration (Coming Soon)
       - Telemetry
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 11db4d350..8017e5c27 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -62,7 +62,7 @@ class Benchmarks(Protocol):
         """
         Register a new benchmark.
 
-        :param dataset_id: The ID of the dataset to used to run the benchmark.
+        :param dataset_id: The ID of the dataset to be used to run the benchmark.
         :param grader_ids: List of grader ids to use for this benchmark.
         :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
         :param metadata: (Optional) Metadata for this benchmark for additional descriptions.
@@ -87,3 +87,10 @@ class Benchmarks(Protocol):
         :param benchmark_id: The ID of the benchmark to get.
         """
         ...
+
+    @webmethod(route="/benchmarks/{benchmark_id}", method="DELETE")
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        """
+        Unregister a benchmark by ID.
+        """
+        ...
diff --git a/llama_stack/apis/common/job_types.py b/llama_stack/apis/common/job_types.py
index 57775754b..307e3fa54 100644
--- a/llama_stack/apis/common/job_types.py
+++ b/llama_stack/apis/common/job_types.py
@@ -5,7 +5,6 @@
 # the root directory of this source tree.
 from datetime import datetime
 from enum import Enum
-from typing import Optional
 
 from pydantic import BaseModel
 
@@ -38,12 +37,12 @@ class CommonJobFields(BaseModel):
     :param id: The ID of the job.
     :param status: The status of the job.
     :param created_at: The time the job was created.
-    :param ended_at: The time the job ended.
+    :param completed_at: The time the job completed.
     :param error: If status of the job is failed, this will contain the error message.
     """
 
     id: str
     status: JobStatus
     created_at: datetime
-    ended_at: Optional[datetime] = None
-    error: Optional[str] = None
+    completed_at: datetime | None = None
+    error: str | None = None
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
index faa620872..e667acfd4 100644
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -48,28 +48,28 @@ EvaluationCandidate = register_schema(
 
 
 @json_schema_type
-class BenchmarkTask(BaseModel):
-    type: Literal["benchmark_id"] = "benchmark_id"
+class BenchmarkEvaluationTask(BaseModel):
+    type: Literal["benchmark"] = "benchmark"
     benchmark_id: str
 
 
 @json_schema_type
-class DatasetGraderTask(BaseModel):
-    type: Literal["dataset_grader"] = "dataset_grader"
+class DatasetEvaluationTask(BaseModel):
+    type: Literal["dataset"] = "dataset"
     dataset_id: str
     grader_ids: List[str]
 
 
 @json_schema_type
-class DataSourceGraderTask(BaseModel):
-    type: Literal["data_source_grader"] = "data_source_grader"
+class DataEvaluationTask(BaseModel):
+    type: Literal["data"] = "data"
     data_source: DataSource
     grader_ids: List[str]
 
 
 EvaluationTask = register_schema(
     Annotated[
-        Union[BenchmarkTask, DatasetGraderTask, DataSourceGraderTask],
+        Union[BenchmarkEvaluationTask, DatasetEvaluationTask, DataEvaluationTask],
         Field(discriminator="type"),
     ],
     name="EvaluationTask",
diff --git a/llama_stack/apis/graders/graders.py b/llama_stack/apis/graders/graders.py
index 98b85552a..522559c3f 100644
--- a/llama_stack/apis/graders/graders.py
+++ b/llama_stack/apis/graders/graders.py
@@ -29,6 +29,13 @@ from .graders import *  # noqa: F401 F403
 class GraderType(Enum):
     """
     A type of grader. Each type is a criteria for evaluating answers.
+
+    :cvar llm: Use an LLM to score the answer.
+    :cvar regex_parser: Use a regex parser to score the answer.
+    :cvar equality: Check if the answer is equal to the reference answer.
+    :cvar subset_of: Check if the answer is a subset of the reference answer.
+    :cvar factuality: Check if the answer is factually correct using LLM as judge.
+    :cvar faithfulness: Check if the answer is faithful to the reference answer using LLM as judge.
     """
 
     llm = "llm"
@@ -221,9 +228,9 @@ class Graders(Protocol):
         ...
 
     @webmethod(route="/graders/{grader_id:path}", method="DELETE")
-    async def delete_grader(self, grader_id: str) -> None:
+    async def unregister_grader(self, grader_id: str) -> None:
         """
-        Delete a grader by ID.
+        Unregister a grader by ID.
         :param grader_id: The ID of the grader.
         """
         ...
diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py
index b4862537a..a5fa0fe39 100644
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@@ -17,6 +17,7 @@ from llama_stack.apis.batch_inference import BatchInference
 from llama_stack.apis.benchmarks import Benchmarks
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
+from llama_stack.apis.eval import Eval
 from llama_stack.apis.evaluation import Evaluation
 from llama_stack.apis.files import Files
 from llama_stack.apis.graders import Graders
@@ -26,6 +27,8 @@ from llama_stack.apis.models import Models
 from llama_stack.apis.post_training import PostTraining
 from llama_stack.apis.providers import Providers
 from llama_stack.apis.safety import Safety
+from llama_stack.apis.scoring import Scoring
+from llama_stack.apis.scoring_functions import ScoringFunctions
 from llama_stack.apis.shields import Shields
 from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration
 from llama_stack.apis.telemetry import Telemetry
@@ -66,6 +69,9 @@ class LlamaStack(
     Files,
     Graders,
     Evaluation,
+    Eval,
+    ScoringFunctions,
+    Scoring,
 ):
     pass
 
@@ -111,7 +117,9 @@ class EnvVarError(Exception):
     def __init__(self, var_name: str, path: str = ""):
         self.var_name = var_name
         self.path = path
-        super().__init__(f"Environment variable '{var_name}' not set or empty{f' at {path}' if path else ''}")
+        super().__init__(
+            f"Environment variable '{var_name}' not set or empty{f' at {path}' if path else ''}"
+        )
 
 
 def redact_sensitive_fields(data: Dict[str, Any]) -> Dict[str, Any]:
@@ -202,7 +210,9 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]:
         if not key:
             raise ValueError(f"Empty key in environment variable pair: {env_pair}")
         if not all(c.isalnum() or c == "_" for c in key):
-            raise ValueError(f"Key must contain only alphanumeric characters and underscores: {key}")
+            raise ValueError(
+                f"Key must contain only alphanumeric characters and underscores: {key}"
+            )
         return key, value
     except ValueError as e:
         raise ValueError(
@@ -215,14 +225,20 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]:
 async def construct_stack(
     run_config: StackRunConfig, provider_registry: Optional[ProviderRegistry] = None
 ) -> Dict[Api, Any]:
-    dist_registry, _ = await create_dist_registry(run_config.metadata_store, run_config.image_name)
-    impls = await resolve_impls(run_config, provider_registry or get_provider_registry(), dist_registry)
+    dist_registry, _ = await create_dist_registry(
+        run_config.metadata_store, run_config.image_name
+    )
+    impls = await resolve_impls(
+        run_config, provider_registry or get_provider_registry(), dist_registry
+    )
     await register_resources(run_config, impls)
     return impls
 
 
 def get_stack_run_config_from_template(template: str) -> StackRunConfig:
-    template_path = importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml"
+    template_path = (
+        importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml"
+    )
 
     with importlib.resources.as_file(template_path) as path:
         if not path.exists():
@@ -265,7 +281,9 @@ def run_config_from_adhoc_config_spec(
 
         # call method "sample_run_config" on the provider spec config class
         provider_config_type = instantiate_class_type(provider_spec.config_class)
-        provider_config = replace_env_vars(provider_config_type.sample_run_config(__distro_dir__=distro_dir))
+        provider_config = replace_env_vars(
+            provider_config_type.sample_run_config(__distro_dir__=distro_dir)
+        )
 
         provider_configs_by_api[api_str] = [
             Provider(