From 035b2dcb605a2ebfe33ec2fdd132e379690284e8 Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Sun, 16 Mar 2025 19:33:57 -0700
Subject: [PATCH] new apis

---
 docs/_static/llama-stack-spec.html        | 2499 ++++++++++-----------
 docs/_static/llama-stack-spec.yaml        | 1738 +++++++-------
 llama_stack/apis/benchmarks/benchmarks.py |   51 +-
 llama_stack/apis/common/job_types.py      |   36 +-
 llama_stack/apis/eval/eval.py             |    8 +-
 llama_stack/apis/evaluation/__init__.py   |    7 +
 llama_stack/apis/evaluation/evaluation.py |  175 ++
 llama_stack/apis/graders/__init__.py      |    7 +
 llama_stack/distribution/stack.py         |   34 +-
 9 files changed, 2365 insertions(+), 2190 deletions(-)
 create mode 100644 llama_stack/apis/evaluation/__init__.py
 create mode 100644 llama_stack/apis/evaluation/evaluation.py
 create mode 100644 llama_stack/apis/graders/__init__.py

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index e3c81ddb9..d6f420cae 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -866,6 +866,83 @@
                 ]
             }
         },
+        "/v1/graders/{grader_id}": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "The grader.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/Grader"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Graders"
+                ],
+                "description": "Get a grader by ID.",
+                "parameters": [
+                    {
+                        "name": "grader_id",
+                        "in": "path",
+                        "description": "The ID of the grader.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
+            },
+            "delete": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Graders"
+                ],
+                "description": "Delete a grader by ID.",
+                "parameters": [
+                    {
+                        "name": "grader_id",
+                        "in": "path",
+                        "description": "The ID of the grader.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
+            }
+        },
         "/v1/inference/embeddings": {
             "post": {
                 "responses": {
@@ -909,59 +986,6 @@
                 }
             }
         },
-        "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "EvaluateResponse object containing generations and scores",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/EvaluateResponse"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Eval"
-                ],
-                "description": "Evaluate a list of rows on a benchmark.",
-                "parameters": [
-                    {
-                        "name": "benchmark_id",
-                        "in": "path",
-                        "description": "The ID of the benchmark to run the evaluation on.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/EvaluateRowsRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
         "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": {
             "get": {
                 "responses": {
@@ -1101,14 +1125,7 @@
                         "content": {
                             "application/json": {
                                 "schema": {
-                                    "oneOf": [
-                                        {
-                                            "$ref": "#/components/schemas/Benchmark"
-                                        },
-                                        {
-                                            "type": "null"
-                                        }
-                                    ]
+                                    "$ref": "#/components/schemas/Benchmark"
                                 }
                             }
                         }
@@ -1129,11 +1146,12 @@
                 "tags": [
                     "Benchmarks"
                 ],
-                "description": "",
+                "description": "Get a benchmark by ID.",
                 "parameters": [
                     {
                         "name": "benchmark_id",
                         "in": "path",
+                        "description": "The ID of the benchmark to get.",
                         "required": true,
                         "schema": {
                             "type": "string"
@@ -1306,55 +1324,6 @@
                 ]
             }
         },
-        "/v1/scoring-functions/{scoring_fn_id}": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "oneOf": [
-                                        {
-                                            "$ref": "#/components/schemas/ScoringFn"
-                                        },
-                                        {
-                                            "type": "null"
-                                        }
-                                    ]
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "ScoringFunctions"
-                ],
-                "description": "",
-                "parameters": [
-                    {
-                        "name": "scoring_fn_id",
-                        "in": "path",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            }
-        },
         "/v1/shields/{identifier}": {
             "get": {
                 "responses": {
@@ -1987,6 +1956,92 @@
                 ]
             }
         },
+        "/v1/evaluation/grade": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "The evaluation job containing grader scores.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluationJob"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Evaluation"
+                ],
+                "description": "Run a grading job with generated results. Use this when you have generated results from inference in a dataset.",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/GradeRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
+        "/v1/evaluation/grade_inline": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "The evaluation job containing grader scores. \"generations\" is not populated in the response.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluationResponse"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Evaluation"
+                ],
+                "description": "Run a grading job with generated results inline.",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/GradeInlineRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
         "/v1/health": {
             "get": {
                 "responses": {
@@ -2238,160 +2293,6 @@
                 ]
             }
         },
-        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "The status of the evaluationjob.",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "oneOf": [
-                                        {
-                                            "$ref": "#/components/schemas/JobStatus"
-                                        },
-                                        {
-                                            "type": "null"
-                                        }
-                                    ]
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Eval"
-                ],
-                "description": "Get the status of a job.",
-                "parameters": [
-                    {
-                        "name": "benchmark_id",
-                        "in": "path",
-                        "description": "The ID of the benchmark to run the evaluation on.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    },
-                    {
-                        "name": "job_id",
-                        "in": "path",
-                        "description": "The ID of the job to get the status of.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            },
-            "delete": {
-                "responses": {
-                    "200": {
-                        "description": "OK"
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Eval"
-                ],
-                "description": "Cancel a job.",
-                "parameters": [
-                    {
-                        "name": "benchmark_id",
-                        "in": "path",
-                        "description": "The ID of the benchmark to run the evaluation on.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    },
-                    {
-                        "name": "job_id",
-                        "in": "path",
-                        "description": "The ID of the job to cancel.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            }
-        },
-        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "The result of the job.",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/EvaluateResponse"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Eval"
-                ],
-                "description": "Get the result of a job.",
-                "parameters": [
-                    {
-                        "name": "benchmark_id",
-                        "in": "path",
-                        "description": "The ID of the benchmark to run the evaluation on.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    },
-                    {
-                        "name": "job_id",
-                        "in": "path",
-                        "description": "The ID of the job to get the result of.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            }
-        },
         "/v1/agents/{agent_id}/sessions": {
             "get": {
                 "responses": {
@@ -2464,13 +2365,20 @@
                 "tags": [
                     "Benchmarks"
                 ],
-                "description": "",
+                "description": "List all benchmarks.",
                 "parameters": []
             },
             "post": {
                 "responses": {
                     "200": {
-                        "description": "OK"
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/Benchmark"
+                                }
+                            }
+                        }
                     },
                     "400": {
                         "$ref": "#/components/responses/BadRequest400"
@@ -2488,7 +2396,7 @@
                 "tags": [
                     "Benchmarks"
                 ],
-                "description": "",
+                "description": "Register a new benchmark.",
                 "parameters": [],
                 "requestBody": {
                     "content": {
@@ -2619,6 +2527,113 @@
                 ]
             }
         },
+        "/v1/graders/types": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "A list of grader types and information about the types.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/ListGraderTypesResponse"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Graders"
+                ],
+                "description": "List all grader types.",
+                "parameters": []
+            }
+        },
+        "/v1/graders": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "A list of graders.",
+                        "content": {
+                            "application/jsonl": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/Grader"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Graders"
+                ],
+                "description": "List all graders.",
+                "parameters": []
+            },
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "The registered grader.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/Grader"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Graders"
+                ],
+                "description": "Register a new grader.",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/RegisterGraderRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
         "/v1/models": {
             "get": {
                 "responses": {
@@ -2809,73 +2824,6 @@
                 ]
             }
         },
-        "/v1/scoring-functions": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/ListScoringFunctionsResponse"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "ScoringFunctions"
-                ],
-                "description": "",
-                "parameters": []
-            },
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "OK"
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "ScoringFunctions"
-                ],
-                "description": "",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/RegisterScoringFunctionRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
         "/v1/shields": {
             "get": {
                 "responses": {
@@ -3460,15 +3408,15 @@
                 }
             }
         },
-        "/v1/eval/benchmarks/{benchmark_id}/jobs": {
+        "/v1/evaluation/run": {
             "post": {
                 "responses": {
                     "200": {
-                        "description": "The job that was created to run the evaluation.",
+                        "description": "OK",
                         "content": {
                             "application/json": {
                                 "schema": {
-                                    "$ref": "#/components/schemas/Job"
+                                    "$ref": "#/components/schemas/EvaluationJob"
                                 }
                             }
                         }
@@ -3487,25 +3435,58 @@
                     }
                 },
                 "tags": [
-                    "Eval"
-                ],
-                "description": "Run an evaluation on a benchmark.",
-                "parameters": [
-                    {
-                        "name": "benchmark_id",
-                        "in": "path",
-                        "description": "The ID of the benchmark to run the evaluation on.",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
+                    "Evaluation"
                 ],
+                "description": "Run an evaluation job.",
+                "parameters": [],
                 "requestBody": {
                     "content": {
                         "application/json": {
                             "schema": {
-                                "$ref": "#/components/schemas/RunEvalRequest"
+                                "$ref": "#/components/schemas/RunRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
+        "/v1/evaluation/run_inline": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluationResponse"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Evaluation"
+                ],
+                "description": "Run an evaluation job inline.",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/RunInlineRequest"
                             }
                         }
                     },
@@ -3592,92 +3573,6 @@
                 }
             }
         },
-        "/v1/scoring/score": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "ScoreResponse object containing rows and aggregated results",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/ScoreResponse"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Scoring"
-                ],
-                "description": "Score a list of rows.",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/ScoreRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
-        "/v1/scoring/score-batch": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/ScoreBatchResponse"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "Scoring"
-                ],
-                "description": "",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/ScoreBatchRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
         "/v1/post-training/supervised-fine-tune": {
             "post": {
                 "responses": {
@@ -6303,381 +6198,6 @@
                 "title": "EmbeddingsResponse",
                 "description": "Response containing generated embeddings."
             },
-            "AgentCandidate": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "agent",
-                        "default": "agent"
-                    },
-                    "config": {
-                        "$ref": "#/components/schemas/AgentConfig",
-                        "description": "The configuration for the agent candidate."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "config"
-                ],
-                "title": "AgentCandidate",
-                "description": "An agent candidate for evaluation."
-            },
-            "AggregationFunctionType": {
-                "type": "string",
-                "enum": [
-                    "average",
-                    "median",
-                    "categorical_count",
-                    "accuracy"
-                ],
-                "title": "AggregationFunctionType"
-            },
-            "BasicScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "basic",
-                        "default": "basic"
-                    },
-                    "aggregation_functions": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/AggregationFunctionType"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "BasicScoringFnParams"
-            },
-            "BenchmarkConfig": {
-                "type": "object",
-                "properties": {
-                    "eval_candidate": {
-                        "$ref": "#/components/schemas/EvalCandidate",
-                        "description": "The candidate to evaluate."
-                    },
-                    "scoring_params": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringFnParams"
-                        },
-                        "description": "Map between scoring function id and parameters for each scoring function you want to run"
-                    },
-                    "num_examples": {
-                        "type": "integer",
-                        "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "eval_candidate",
-                    "scoring_params"
-                ],
-                "title": "BenchmarkConfig",
-                "description": "A benchmark configuration for evaluation."
-            },
-            "EvalCandidate": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/ModelCandidate"
-                    },
-                    {
-                        "$ref": "#/components/schemas/AgentCandidate"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "model": "#/components/schemas/ModelCandidate",
-                        "agent": "#/components/schemas/AgentCandidate"
-                    }
-                }
-            },
-            "LLMAsJudgeScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "llm_as_judge",
-                        "default": "llm_as_judge"
-                    },
-                    "judge_model": {
-                        "type": "string"
-                    },
-                    "prompt_template": {
-                        "type": "string"
-                    },
-                    "judge_score_regexes": {
-                        "type": "array",
-                        "items": {
-                            "type": "string"
-                        }
-                    },
-                    "aggregation_functions": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/AggregationFunctionType"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "judge_model"
-                ],
-                "title": "LLMAsJudgeScoringFnParams"
-            },
-            "ModelCandidate": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "model",
-                        "default": "model"
-                    },
-                    "model": {
-                        "type": "string",
-                        "description": "The model ID to evaluate."
-                    },
-                    "sampling_params": {
-                        "$ref": "#/components/schemas/SamplingParams",
-                        "description": "The sampling parameters for the model."
-                    },
-                    "system_message": {
-                        "$ref": "#/components/schemas/SystemMessage",
-                        "description": "(Optional) The system message providing instructions or context to the model."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "model",
-                    "sampling_params"
-                ],
-                "title": "ModelCandidate",
-                "description": "A model candidate for evaluation."
-            },
-            "RegexParserScoringFnParams": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "regex_parser",
-                        "default": "regex_parser"
-                    },
-                    "parsing_regexes": {
-                        "type": "array",
-                        "items": {
-                            "type": "string"
-                        }
-                    },
-                    "aggregation_functions": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/AggregationFunctionType"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "RegexParserScoringFnParams"
-            },
-            "ScoringFnParams": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/RegexParserScoringFnParams"
-                    },
-                    {
-                        "$ref": "#/components/schemas/BasicScoringFnParams"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams",
-                        "regex_parser": "#/components/schemas/RegexParserScoringFnParams",
-                        "basic": "#/components/schemas/BasicScoringFnParams"
-                    }
-                }
-            },
-            "EvaluateRowsRequest": {
-                "type": "object",
-                "properties": {
-                    "input_rows": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
-                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
-                        },
-                        "description": "The rows to evaluate."
-                    },
-                    "scoring_functions": {
-                        "type": "array",
-                        "items": {
-                            "type": "string"
-                        },
-                        "description": "The scoring functions to use for the evaluation."
-                    },
-                    "benchmark_config": {
-                        "$ref": "#/components/schemas/BenchmarkConfig",
-                        "description": "The configuration for the benchmark."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "input_rows",
-                    "scoring_functions",
-                    "benchmark_config"
-                ],
-                "title": "EvaluateRowsRequest"
-            },
-            "EvaluateResponse": {
-                "type": "object",
-                "properties": {
-                    "generations": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
-                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
-                        },
-                        "description": "The generations from the evaluation."
-                    },
-                    "scores": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringResult"
-                        },
-                        "description": "The scores from the evaluation."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "generations",
-                    "scores"
-                ],
-                "title": "EvaluateResponse",
-                "description": "The response from an evaluation."
-            },
-            "ScoringResult": {
-                "type": "object",
-                "properties": {
-                    "score_rows": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
-                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
-                        },
-                        "description": "The scoring result for each row. Each row is a map of column name to value."
-                    },
-                    "aggregated_results": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        },
-                        "description": "Map of metric name to aggregated value"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "score_rows",
-                    "aggregated_results"
-                ],
-                "title": "ScoringResult",
-                "description": "A scoring result for a single row."
-            },
             "Agent": {
                 "type": "object",
                 "properties": {
@@ -6783,13 +6303,15 @@
                         "default": "benchmark"
                     },
                     "dataset_id": {
-                        "type": "string"
+                        "type": "string",
+                        "description": "The ID of the dataset to use to run the benchmark."
                     },
-                    "scoring_functions": {
+                    "grader_ids": {
                         "type": "array",
                         "items": {
                             "type": "string"
-                        }
+                        },
+                        "description": "The grader ids to use for this benchmark."
                     },
                     "metadata": {
                         "type": "object",
@@ -6814,7 +6336,8 @@
                                     "type": "object"
                                 }
                             ]
-                        }
+                        },
+                        "description": "Metadata for this benchmark for additional descriptions."
                     }
                 },
                 "additionalProperties": false,
@@ -6824,7 +6347,7 @@
                     "provider_id",
                     "type",
                     "dataset_id",
-                    "scoring_functions",
+                    "grader_ids",
                     "metadata"
                 ],
                 "title": "Benchmark"
@@ -6981,6 +6504,361 @@
                 "title": "URIDataSource",
                 "description": "A dataset that can be obtained from a URI."
             },
+            "EqualityGrader": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "equality",
+                        "default": "equality"
+                    },
+                    "equality": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                }
+                            }
+                        },
+                        "additionalProperties": false,
+                        "required": [
+                            "aggregation_functions"
+                        ],
+                        "title": "BasicGraderParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "equality"
+                ],
+                "title": "EqualityGrader"
+            },
+            "FactualityGrader": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "factuality",
+                        "default": "factuality"
+                    },
+                    "factuality": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                }
+                            }
+                        },
+                        "additionalProperties": false,
+                        "required": [
+                            "aggregation_functions"
+                        ],
+                        "title": "BasicGraderParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "factuality"
+                ],
+                "title": "FactualityGrader"
+            },
+            "FaithfulnessGrader": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "faithfulness",
+                        "default": "faithfulness"
+                    },
+                    "faithfulness": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                }
+                            }
+                        },
+                        "additionalProperties": false,
+                        "required": [
+                            "aggregation_functions"
+                        ],
+                        "title": "BasicGraderParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "faithfulness"
+                ],
+                "title": "FaithfulnessGrader"
+            },
+            "Grader": {
+                "type": "object",
+                "properties": {
+                    "identifier": {
+                        "type": "string"
+                    },
+                    "provider_resource_id": {
+                        "type": "string"
+                    },
+                    "provider_id": {
+                        "type": "string"
+                    },
+                    "type": {
+                        "type": "string",
+                        "const": "grader",
+                        "default": "grader"
+                    },
+                    "grader": {
+                        "$ref": "#/components/schemas/GraderDefinition"
+                    },
+                    "description": {
+                        "type": "string"
+                    },
+                    "metadata": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "identifier",
+                    "provider_resource_id",
+                    "provider_id",
+                    "type",
+                    "grader",
+                    "metadata"
+                ],
+                "title": "Grader"
+            },
+            "GraderDefinition": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/LlmGrader"
+                    },
+                    {
+                        "$ref": "#/components/schemas/RegexParserGrader"
+                    },
+                    {
+                        "$ref": "#/components/schemas/EqualityGrader"
+                    },
+                    {
+                        "$ref": "#/components/schemas/SubsetOfGrader"
+                    },
+                    {
+                        "$ref": "#/components/schemas/FactualityGrader"
+                    },
+                    {
+                        "$ref": "#/components/schemas/FaithfulnessGrader"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "llm": "#/components/schemas/LlmGrader",
+                        "regex_parser": "#/components/schemas/RegexParserGrader",
+                        "equality": "#/components/schemas/EqualityGrader",
+                        "subset_of": "#/components/schemas/SubsetOfGrader",
+                        "factuality": "#/components/schemas/FactualityGrader",
+                        "faithfulness": "#/components/schemas/FaithfulnessGrader"
+                    }
+                }
+            },
+            "LlmGrader": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "llm",
+                        "default": "llm"
+                    },
+                    "llm": {
+                        "type": "object",
+                        "properties": {
+                            "model": {
+                                "type": "string"
+                            },
+                            "prompt": {
+                                "type": "string"
+                            },
+                            "score_regexes": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string"
+                                }
+                            },
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                }
+                            }
+                        },
+                        "additionalProperties": false,
+                        "required": [
+                            "model",
+                            "prompt",
+                            "score_regexes",
+                            "aggregation_functions"
+                        ],
+                        "title": "LlmGraderParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "llm"
+                ],
+                "title": "LlmGrader"
+            },
+            "RegexParserGrader": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "regex_parser",
+                        "default": "regex_parser"
+                    },
+                    "regex_parser": {
+                        "type": "object",
+                        "properties": {
+                            "parsing_regexes": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string"
+                                }
+                            },
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                }
+                            }
+                        },
+                        "additionalProperties": false,
+                        "required": [
+                            "parsing_regexes",
+                            "aggregation_functions"
+                        ],
+                        "title": "RegexParserGraderParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "regex_parser"
+                ],
+                "title": "RegexParserGrader"
+            },
+            "SubsetOfGrader": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "subset_of",
+                        "default": "subset_of"
+                    },
+                    "subset_of": {
+                        "type": "object",
+                        "properties": {
+                            "aggregation_functions": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string",
+                                    "enum": [
+                                        "average",
+                                        "median",
+                                        "categorical_count",
+                                        "accuracy"
+                                    ],
+                                    "title": "AggregationFunctionType",
+                                    "description": "A type of aggregation function."
+                                }
+                            }
+                        },
+                        "additionalProperties": false,
+                        "required": [
+                            "aggregation_functions"
+                        ],
+                        "title": "BasicGraderParams"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "subset_of"
+                ],
+                "title": "SubsetOfGrader"
+            },
             "Model": {
                 "type": "object",
                 "properties": {
@@ -7047,268 +6925,6 @@
                 ],
                 "title": "ModelType"
             },
-            "AgentTurnInputType": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "agent_turn_input",
-                        "default": "agent_turn_input"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "AgentTurnInputType"
-            },
-            "ArrayType": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "array",
-                        "default": "array"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "ArrayType"
-            },
-            "BooleanType": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "boolean",
-                        "default": "boolean"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "BooleanType"
-            },
-            "ChatCompletionInputType": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "chat_completion_input",
-                        "default": "chat_completion_input"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "ChatCompletionInputType"
-            },
-            "CompletionInputType": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "completion_input",
-                        "default": "completion_input"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "CompletionInputType"
-            },
-            "JsonType": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "json",
-                        "default": "json"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "JsonType"
-            },
-            "NumberType": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "number",
-                        "default": "number"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "NumberType"
-            },
-            "ObjectType": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "object",
-                        "default": "object"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "ObjectType"
-            },
-            "ParamType": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/StringType"
-                    },
-                    {
-                        "$ref": "#/components/schemas/NumberType"
-                    },
-                    {
-                        "$ref": "#/components/schemas/BooleanType"
-                    },
-                    {
-                        "$ref": "#/components/schemas/ArrayType"
-                    },
-                    {
-                        "$ref": "#/components/schemas/ObjectType"
-                    },
-                    {
-                        "$ref": "#/components/schemas/JsonType"
-                    },
-                    {
-                        "$ref": "#/components/schemas/UnionType"
-                    },
-                    {
-                        "$ref": "#/components/schemas/ChatCompletionInputType"
-                    },
-                    {
-                        "$ref": "#/components/schemas/CompletionInputType"
-                    },
-                    {
-                        "$ref": "#/components/schemas/AgentTurnInputType"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "string": "#/components/schemas/StringType",
-                        "number": "#/components/schemas/NumberType",
-                        "boolean": "#/components/schemas/BooleanType",
-                        "array": "#/components/schemas/ArrayType",
-                        "object": "#/components/schemas/ObjectType",
-                        "json": "#/components/schemas/JsonType",
-                        "union": "#/components/schemas/UnionType",
-                        "chat_completion_input": "#/components/schemas/ChatCompletionInputType",
-                        "completion_input": "#/components/schemas/CompletionInputType",
-                        "agent_turn_input": "#/components/schemas/AgentTurnInputType"
-                    }
-                }
-            },
-            "ScoringFn": {
-                "type": "object",
-                "properties": {
-                    "identifier": {
-                        "type": "string"
-                    },
-                    "provider_resource_id": {
-                        "type": "string"
-                    },
-                    "provider_id": {
-                        "type": "string"
-                    },
-                    "type": {
-                        "type": "string",
-                        "const": "scoring_function",
-                        "default": "scoring_function"
-                    },
-                    "description": {
-                        "type": "string"
-                    },
-                    "metadata": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        }
-                    },
-                    "return_type": {
-                        "$ref": "#/components/schemas/ParamType"
-                    },
-                    "params": {
-                        "$ref": "#/components/schemas/ScoringFnParams"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "identifier",
-                    "provider_resource_id",
-                    "provider_id",
-                    "type",
-                    "metadata",
-                    "return_type"
-                ],
-                "title": "ScoringFn"
-            },
-            "StringType": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "string",
-                        "default": "string"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "StringType"
-            },
-            "UnionType": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "union",
-                        "default": "union"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type"
-                ],
-                "title": "UnionType"
-            },
             "Shield": {
                 "type": "object",
                 "properties": {
@@ -7707,16 +7323,6 @@
                 "title": "PostTrainingJobArtifactsResponse",
                 "description": "Artifacts of a finetuning job."
             },
-            "JobStatus": {
-                "type": "string",
-                "enum": [
-                    "completed",
-                    "in_progress",
-                    "failed",
-                    "scheduled"
-                ],
-                "title": "JobStatus"
-            },
             "PostTrainingJobStatusResponse": {
                 "type": "object",
                 "properties": {
@@ -7724,7 +7330,15 @@
                         "type": "string"
                     },
                     "status": {
-                        "$ref": "#/components/schemas/JobStatus"
+                        "type": "string",
+                        "enum": [
+                            "completed",
+                            "in_progress",
+                            "failed",
+                            "scheduled",
+                            "cancelled"
+                        ],
+                        "title": "JobStatus"
                     },
                     "scheduled_at": {
                         "type": "string",
@@ -7840,6 +7454,363 @@
                 ],
                 "title": "VectorDB"
             },
+            "BenchmarkTask": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "benchmark_id",
+                        "default": "benchmark_id"
+                    },
+                    "benchmark_id": {
+                        "type": "string"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "benchmark_id"
+                ],
+                "title": "BenchmarkTask"
+            },
+            "DataSourceGraderTask": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "data_source_grader",
+                        "default": "data_source_grader"
+                    },
+                    "data_source": {
+                        "$ref": "#/components/schemas/DataSource"
+                    },
+                    "grader_ids": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "data_source",
+                    "grader_ids"
+                ],
+                "title": "DataSourceGraderTask"
+            },
+            "DatasetGraderTask": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "dataset_grader",
+                        "default": "dataset_grader"
+                    },
+                    "dataset_id": {
+                        "type": "string"
+                    },
+                    "grader_ids": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "dataset_id",
+                    "grader_ids"
+                ],
+                "title": "DatasetGraderTask"
+            },
+            "EvaluationTask": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/BenchmarkTask"
+                    },
+                    {
+                        "$ref": "#/components/schemas/DatasetGraderTask"
+                    },
+                    {
+                        "$ref": "#/components/schemas/DataSourceGraderTask"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "benchmark_id": "#/components/schemas/BenchmarkTask",
+                        "dataset_grader": "#/components/schemas/DatasetGraderTask",
+                        "data_source_grader": "#/components/schemas/DataSourceGraderTask"
+                    }
+                }
+            },
+            "GradeRequest": {
+                "type": "object",
+                "properties": {
+                    "task": {
+                        "$ref": "#/components/schemas/EvaluationTask",
+                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "task"
+                ],
+                "title": "GradeRequest"
+            },
+            "AgentCandidate": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "agent",
+                        "default": "agent"
+                    },
+                    "config": {
+                        "$ref": "#/components/schemas/AgentConfig",
+                        "description": "The configuration for the agent candidate."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "config"
+                ],
+                "title": "AgentCandidate",
+                "description": "An agent candidate for evaluation."
+            },
+            "EvaluationCandidate": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/ModelCandidate"
+                    },
+                    {
+                        "$ref": "#/components/schemas/AgentCandidate"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "model": "#/components/schemas/ModelCandidate",
+                        "agent": "#/components/schemas/AgentCandidate"
+                    }
+                }
+            },
+            "EvaluationJob": {
+                "type": "object",
+                "properties": {
+                    "id": {
+                        "type": "string",
+                        "description": "The ID of the job."
+                    },
+                    "status": {
+                        "type": "string",
+                        "enum": [
+                            "completed",
+                            "in_progress",
+                            "failed",
+                            "scheduled",
+                            "cancelled"
+                        ],
+                        "description": "The status of the job."
+                    },
+                    "created_at": {
+                        "type": "string",
+                        "format": "date-time",
+                        "description": "The time the job was created."
+                    },
+                    "ended_at": {
+                        "type": "string",
+                        "format": "date-time",
+                        "description": "The time the job ended."
+                    },
+                    "error": {
+                        "type": "string",
+                        "description": "If status of the job is failed, this will contain the error message."
+                    },
+                    "type": {
+                        "type": "string",
+                        "const": "evaluation",
+                        "default": "evaluation"
+                    },
+                    "task": {
+                        "$ref": "#/components/schemas/EvaluationTask"
+                    },
+                    "candidate": {
+                        "$ref": "#/components/schemas/EvaluationCandidate"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "id",
+                    "status",
+                    "created_at",
+                    "type",
+                    "task",
+                    "candidate"
+                ],
+                "title": "EvaluationJob"
+            },
+            "ModelCandidate": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "model",
+                        "default": "model"
+                    },
+                    "model_id": {
+                        "type": "string"
+                    },
+                    "sampling_params": {
+                        "$ref": "#/components/schemas/SamplingParams",
+                        "description": "The sampling parameters for the model."
+                    },
+                    "system_message": {
+                        "$ref": "#/components/schemas/SystemMessage",
+                        "description": "(Optional) The system message providing instructions or context to the model."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "model_id",
+                    "sampling_params"
+                ],
+                "title": "ModelCandidate",
+                "description": "A model candidate for evaluation."
+            },
+            "GradeInlineRequest": {
+                "type": "object",
+                "properties": {
+                    "task": {
+                        "$ref": "#/components/schemas/EvaluationTask",
+                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "task"
+                ],
+                "title": "GradeInlineRequest"
+            },
+            "EvaluationResponse": {
+                "type": "object",
+                "properties": {
+                    "generations": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
+                        },
+                        "description": "The generations in rows for the evaluation."
+                    },
+                    "scores": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ScoringResult"
+                        },
+                        "description": "The scores for the evaluation. Map of grader id to ScoringResult."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "generations",
+                    "scores"
+                ],
+                "title": "EvaluationResponse",
+                "description": "A response to an inline evaluation."
+            },
+            "ScoringResult": {
+                "type": "object",
+                "properties": {
+                    "scores": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
+                        },
+                        "description": "The scoring result for each row. Each row is a map of grader column name to value."
+                    },
+                    "metrics": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        },
+                        "description": "Map of metric name to aggregated value."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "scores",
+                    "metrics"
+                ],
+                "title": "ScoringResult",
+                "description": "The scoring result for a single grader: per-row scores plus aggregated metrics."
+            },
             "HealthInfo": {
                 "type": "object",
                 "properties": {
@@ -8285,6 +8256,65 @@
                 "title": "ListFileResponse",
                 "description": "Response representing a list of file entries."
             },
+            "GraderTypeInfo": {
+                "type": "object",
+                "properties": {
+                    "grader_type": {
+                        "type": "string",
+                        "enum": [
+                            "llm",
+                            "regex_parser",
+                            "equality",
+                            "subset_of",
+                            "factuality",
+                            "faithfulness"
+                        ],
+                        "title": "GraderType",
+                        "description": "A type of grader. Each type is a criteria for evaluating answers."
+                    },
+                    "description": {
+                        "type": "string",
+                        "description": "A description of the grader type, e.g. \"Write your custom judge prompt to score the answer.\""
+                    },
+                    "supported_dataset_purposes": {
+                        "type": "array",
+                        "items": {
+                            "type": "string",
+                            "enum": [
+                                "post-training/messages",
+                                "eval/question-answer",
+                                "eval/messages-answer"
+                            ],
+                            "title": "DatasetPurpose",
+                            "description": "Purpose of the dataset. Each purpose has a required input data schema."
+                        },
+                        "description": "The purposes that this grader can be used for."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "grader_type",
+                    "description",
+                    "supported_dataset_purposes"
+                ],
+                "title": "GraderTypeInfo"
+            },
+            "ListGraderTypesResponse": {
+                "type": "object",
+                "properties": {
+                    "data": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/GraderTypeInfo"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "data"
+                ],
+                "title": "ListGraderTypesResponse"
+            },
             "ListModelsResponse": {
                 "type": "object",
                 "properties": {
@@ -8357,22 +8387,6 @@
                 ],
                 "title": "ListRoutesResponse"
             },
-            "ListScoringFunctionsResponse": {
-                "type": "object",
-                "properties": {
-                    "data": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/ScoringFn"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "data"
-                ],
-                "title": "ListScoringFunctionsResponse"
-            },
             "ListShieldsResponse": {
                 "type": "object",
                 "properties": {
@@ -9363,23 +9377,20 @@
             "RegisterBenchmarkRequest": {
                 "type": "object",
                 "properties": {
-                    "benchmark_id": {
-                        "type": "string"
-                    },
                     "dataset_id": {
-                        "type": "string"
+                        "type": "string",
+                        "description": "The ID of the dataset to be used to run the benchmark."
                     },
-                    "scoring_functions": {
+                    "grader_ids": {
                         "type": "array",
                         "items": {
                             "type": "string"
-                        }
+                        },
+                        "description": "List of grader ids to use for this benchmark."
                     },
-                    "provider_benchmark_id": {
-                        "type": "string"
-                    },
-                    "provider_id": {
-                        "type": "string"
+                    "benchmark_id": {
+                        "type": "string",
+                        "description": "(Optional) The ID of the benchmark to register. If not provided, an ID will be generated."
                     },
                     "metadata": {
                         "type": "object",
@@ -9404,14 +9415,14 @@
                                     "type": "object"
                                 }
                             ]
-                        }
+                        },
+                        "description": "(Optional) Metadata for this benchmark for additional descriptions."
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "benchmark_id",
                     "dataset_id",
-                    "scoring_functions"
+                    "grader_ids"
                 ],
                 "title": "RegisterBenchmarkRequest"
             },
@@ -9469,6 +9480,50 @@
                 ],
                 "title": "RegisterDatasetRequest"
             },
+            "RegisterGraderRequest": {
+                "type": "object",
+                "properties": {
+                    "grader": {
+                        "$ref": "#/components/schemas/GraderDefinition",
+                        "description": "The grader definition, E.g. - { \"type\": \"llm\", \"llm\": { \"model\": \"llama-405b\", \"prompt\": \"You are a judge. Score the answer based on the question. {question} {answer}\", } }"
+                    },
+                    "grader_id": {
+                        "type": "string",
+                        "description": "(Optional) The ID of the grader. If not provided, a random ID will be generated."
+                    },
+                    "metadata": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        },
+                        "description": "(Optional) Any additional metadata for this grader. - E.g. { \"description\": \"A grader that scores the answer based on the question.\", }"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "grader"
+                ],
+                "title": "RegisterGraderRequest"
+            },
             "RegisterModelRequest": {
                 "type": "object",
                 "properties": {
@@ -9516,36 +9571,6 @@
                 ],
                 "title": "RegisterModelRequest"
             },
-            "RegisterScoringFunctionRequest": {
-                "type": "object",
-                "properties": {
-                    "scoring_fn_id": {
-                        "type": "string"
-                    },
-                    "description": {
-                        "type": "string"
-                    },
-                    "return_type": {
-                        "$ref": "#/components/schemas/ParamType"
-                    },
-                    "provider_scoring_fn_id": {
-                        "type": "string"
-                    },
-                    "provider_id": {
-                        "type": "string"
-                    },
-                    "params": {
-                        "$ref": "#/components/schemas/ScoringFnParams"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "scoring_fn_id",
-                    "description",
-                    "return_type"
-                ],
-                "title": "RegisterScoringFunctionRequest"
-            },
             "RegisterShieldRequest": {
                 "type": "object",
                 "properties": {
@@ -9682,32 +9707,43 @@
                 ],
                 "title": "ResumeAgentTurnRequest"
             },
-            "RunEvalRequest": {
+            "RunRequest": {
                 "type": "object",
                 "properties": {
-                    "benchmark_config": {
-                        "$ref": "#/components/schemas/BenchmarkConfig",
-                        "description": "The configuration for the benchmark."
+                    "task": {
+                        "$ref": "#/components/schemas/EvaluationTask",
+                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                    },
+                    "candidate": {
+                        "$ref": "#/components/schemas/EvaluationCandidate",
+                        "description": "The candidate to evaluate."
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "benchmark_config"
+                    "task",
+                    "candidate"
                 ],
-                "title": "RunEvalRequest"
+                "title": "RunRequest"
             },
-            "Job": {
+            "RunInlineRequest": {
                 "type": "object",
                 "properties": {
-                    "job_id": {
-                        "type": "string"
+                    "task": {
+                        "$ref": "#/components/schemas/EvaluationTask",
+                        "description": "The task to evaluate. One of: - BenchmarkTask: Run evaluation task against a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids"
+                    },
+                    "candidate": {
+                        "$ref": "#/components/schemas/EvaluationCandidate",
+                        "description": "The candidate to evaluate."
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "job_id"
+                    "task",
+                    "candidate"
                 ],
-                "title": "Job"
+                "title": "RunInlineRequest"
             },
             "RunShieldRequest": {
                 "type": "object",
@@ -9795,128 +9831,6 @@
                 ],
                 "title": "SaveSpansToDatasetRequest"
             },
-            "ScoreRequest": {
-                "type": "object",
-                "properties": {
-                    "input_rows": {
-                        "type": "array",
-                        "items": {
-                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
-                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
-                        },
-                        "description": "The rows to score."
-                    },
-                    "scoring_functions": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "$ref": "#/components/schemas/ScoringFnParams"
-                                },
-                                {
-                                    "type": "null"
-                                }
-                            ]
-                        },
-                        "description": "The scoring functions to use for the scoring."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "input_rows",
-                    "scoring_functions"
-                ],
-                "title": "ScoreRequest"
-            },
-            "ScoreResponse": {
-                "type": "object",
-                "properties": {
-                    "results": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringResult"
-                        },
-                        "description": "A map of scoring function name to ScoringResult."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "results"
-                ],
-                "title": "ScoreResponse",
-                "description": "The response from scoring."
-            },
-            "ScoreBatchRequest": {
-                "type": "object",
-                "properties": {
-                    "dataset_id": {
-                        "type": "string"
-                    },
-                    "scoring_functions": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "$ref": "#/components/schemas/ScoringFnParams"
-                                },
-                                {
-                                    "type": "null"
-                                }
-                            ]
-                        }
-                    },
-                    "save_results_dataset": {
-                        "type": "boolean"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "dataset_id",
-                    "scoring_functions",
-                    "save_results_dataset"
-                ],
-                "title": "ScoreBatchRequest"
-            },
-            "ScoreBatchResponse": {
-                "type": "object",
-                "properties": {
-                    "dataset_id": {
-                        "type": "string"
-                    },
-                    "results": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringResult"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "results"
-                ],
-                "title": "ScoreBatchResponse"
-            },
             "AlgorithmConfig": {
                 "oneOf": [
                     {
@@ -10280,12 +10194,14 @@
             "name": "Datasets"
         },
         {
-            "name": "Eval",
-            "x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates."
+            "name": "Evaluation"
         },
         {
             "name": "Files"
         },
+        {
+            "name": "Graders"
+        },
         {
             "name": "Inference",
             "description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
@@ -10307,12 +10223,6 @@
         {
             "name": "Safety"
         },
-        {
-            "name": "Scoring"
-        },
-        {
-            "name": "ScoringFunctions"
-        },
         {
             "name": "Shields"
         },
@@ -10344,16 +10254,15 @@
                 "Benchmarks",
                 "DatasetIO",
                 "Datasets",
-                "Eval",
+                "Evaluation",
                 "Files",
+                "Graders",
                 "Inference",
                 "Inspect",
                 "Models",
                 "PostTraining (Coming Soon)",
                 "Providers",
                 "Safety",
-                "Scoring",
-                "ScoringFunctions",
                 "Shields",
                 "SyntheticDataGeneration (Coming Soon)",
                 "Telemetry",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index a3d4dbcc9..db92e7e6a 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -589,6 +589,59 @@ paths:
           required: true
           schema:
             type: string
+  /v1/graders/{grader_id}:
+    get:
+      responses:
+        '200':
+          description: The grader.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Grader'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Graders
+      description: Get a grader by ID.
+      parameters:
+        - name: grader_id
+          in: path
+          description: The ID of the grader.
+          required: true
+          schema:
+            type: string
+    delete:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Graders
+      description: Delete a grader by ID.
+      parameters:
+        - name: grader_id
+          in: path
+          description: The ID of the grader.
+          required: true
+          schema:
+            type: string
   /v1/inference/embeddings:
     post:
       responses:
@@ -622,43 +675,6 @@ paths:
             schema:
               $ref: '#/components/schemas/EmbeddingsRequest'
         required: true
-  /v1/eval/benchmarks/{benchmark_id}/evaluations:
-    post:
-      responses:
-        '200':
-          description: >-
-            EvaluateResponse object containing generations and scores
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Eval
-      description: Evaluate a list of rows on a benchmark.
-      parameters:
-        - name: benchmark_id
-          in: path
-          description: >-
-            The ID of the benchmark to run the evaluation on.
-          required: true
-          schema:
-            type: string
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/EvaluateRowsRequest'
-        required: true
   /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}:
     get:
       responses:
@@ -757,9 +773,7 @@ paths:
           content:
             application/json:
               schema:
-                oneOf:
-                  - $ref: '#/components/schemas/Benchmark'
-                  - type: 'null'
+                $ref: '#/components/schemas/Benchmark'
         '400':
           $ref: '#/components/responses/BadRequest400'
         '429':
@@ -772,10 +786,11 @@ paths:
           $ref: '#/components/responses/DefaultError'
       tags:
         - Benchmarks
-      description: ''
+      description: Get a benchmark by ID.
       parameters:
         - name: benchmark_id
           in: path
+          description: The ID of the benchmark to get.
           required: true
           schema:
             type: string
@@ -885,36 +900,6 @@ paths:
           required: true
           schema:
             type: string
-  /v1/scoring-functions/{scoring_fn_id}:
-    get:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                oneOf:
-                  - $ref: '#/components/schemas/ScoringFn'
-                  - type: 'null'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ScoringFunctions
-      description: ''
-      parameters:
-        - name: scoring_fn_id
-          in: path
-          required: true
-          schema:
-            type: string
   /v1/shields/{identifier}:
     get:
       responses:
@@ -1326,6 +1311,70 @@ paths:
           required: true
           schema:
             type: string
+  /v1/evaluation/grade:
+    post:
+      responses:
+        '200':
+          description: >-
+            The evaluation job containing grader scores.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EvaluationJob'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Evaluation
+      description: >-
+        Run a grading job with generated results. Use this when you have generated
+        results from inference in a dataset.
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/GradeRequest'
+        required: true
+  /v1/evaluation/grade_inline:
+    post:
+      responses:
+        '200':
+          description: >-
+            The evaluation job containing grader scores. "generations" is not populated
+            in the response.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EvaluationResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Evaluation
+      description: >-
+        Run a grading job with generated results inline.
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/GradeInlineRequest'
+        required: true
   /v1/health:
     get:
       responses:
@@ -1501,111 +1550,6 @@ paths:
           required: false
           schema:
             type: integer
-  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
-    get:
-      responses:
-        '200':
-          description: The status of the evaluationjob.
-          content:
-            application/json:
-              schema:
-                oneOf:
-                  - $ref: '#/components/schemas/JobStatus'
-                  - type: 'null'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Eval
-      description: Get the status of a job.
-      parameters:
-        - name: benchmark_id
-          in: path
-          description: >-
-            The ID of the benchmark to run the evaluation on.
-          required: true
-          schema:
-            type: string
-        - name: job_id
-          in: path
-          description: The ID of the job to get the status of.
-          required: true
-          schema:
-            type: string
-    delete:
-      responses:
-        '200':
-          description: OK
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Eval
-      description: Cancel a job.
-      parameters:
-        - name: benchmark_id
-          in: path
-          description: >-
-            The ID of the benchmark to run the evaluation on.
-          required: true
-          schema:
-            type: string
-        - name: job_id
-          in: path
-          description: The ID of the job to cancel.
-          required: true
-          schema:
-            type: string
-  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
-    get:
-      responses:
-        '200':
-          description: The result of the job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Eval
-      description: Get the result of a job.
-      parameters:
-        - name: benchmark_id
-          in: path
-          description: >-
-            The ID of the benchmark to run the evaluation on.
-          required: true
-          schema:
-            type: string
-        - name: job_id
-          in: path
-          description: The ID of the job to get the result of.
-          required: true
-          schema:
-            type: string
   /v1/agents/{agent_id}/sessions:
     get:
       responses:
@@ -1657,12 +1601,16 @@ paths:
           $ref: '#/components/responses/DefaultError'
       tags:
         - Benchmarks
-      description: ''
+      description: List all benchmarks.
       parameters: []
     post:
       responses:
         '200':
           description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Benchmark'
         '400':
           $ref: '#/components/responses/BadRequest400'
         '429':
@@ -1675,7 +1623,7 @@ paths:
           $ref: '#/components/responses/DefaultError'
       tags:
         - Benchmarks
-      description: ''
+      description: Register a new benchmark.
       parameters: []
       requestBody:
         content:
@@ -1763,6 +1711,81 @@ paths:
           required: true
           schema:
             type: string
+  /v1/graders/types:
+    get:
+      responses:
+        '200':
+          description: >-
+            A list of grader types and information about the types.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListGraderTypesResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Graders
+      description: List all grader types.
+      parameters: []
+  /v1/graders:
+    get:
+      responses:
+        '200':
+          description: A list of graders.
+          content:
+            application/jsonl:
+              schema:
+                $ref: '#/components/schemas/Grader'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Graders
+      description: List all graders.
+      parameters: []
+    post:
+      responses:
+        '200':
+          description: The registered grader.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Grader'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Graders
+      description: Register a new grader.
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RegisterGraderRequest'
+        required: true
   /v1/models:
     get:
       responses:
@@ -1893,53 +1916,6 @@ paths:
           required: false
           schema:
             $ref: '#/components/schemas/URL'
-  /v1/scoring-functions:
-    get:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListScoringFunctionsResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ScoringFunctions
-      description: ''
-      parameters: []
-    post:
-      responses:
-        '200':
-          description: OK
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ScoringFunctions
-      description: ''
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterScoringFunctionRequest'
-        required: true
   /v1/shields:
     get:
       responses:
@@ -2345,16 +2321,15 @@ paths:
             schema:
               $ref: '#/components/schemas/ResumeAgentTurnRequest'
         required: true
-  /v1/eval/benchmarks/{benchmark_id}/jobs:
+  /v1/evaluation/run:
     post:
       responses:
         '200':
-          description: >-
-            The job that was created to run the evaluation.
+          description: OK
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/Job'
+                $ref: '#/components/schemas/EvaluationJob'
         '400':
           $ref: '#/components/responses/BadRequest400'
         '429':
@@ -2366,21 +2341,43 @@ paths:
         default:
           $ref: '#/components/responses/DefaultError'
       tags:
-        - Eval
-      description: Run an evaluation on a benchmark.
-      parameters:
-        - name: benchmark_id
-          in: path
-          description: >-
-            The ID of the benchmark to run the evaluation on.
-          required: true
-          schema:
-            type: string
+        - Evaluation
+      description: Run an evaluation job.
+      parameters: []
       requestBody:
         content:
           application/json:
             schema:
-              $ref: '#/components/schemas/RunEvalRequest'
+              $ref: '#/components/schemas/RunRequest'
+        required: true
+  /v1/evaluation/run_inline:
+    post:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EvaluationResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Evaluation
+      description: Run an evaluation job inline.
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RunInlineRequest'
         required: true
   /v1/safety/run-shield:
     post:
@@ -2436,65 +2433,6 @@ paths:
             schema:
               $ref: '#/components/schemas/SaveSpansToDatasetRequest'
         required: true
-  /v1/scoring/score:
-    post:
-      responses:
-        '200':
-          description: >-
-            ScoreResponse object containing rows and aggregated results
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Scoring
-      description: Score a list of rows.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreRequest'
-        required: true
-  /v1/scoring/score-batch:
-    post:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreBatchResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Scoring
-      description: ''
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreBatchRequest'
-        required: true
   /v1/post-training/supervised-fine-tune:
     post:
       responses:
@@ -4384,251 +4322,6 @@ components:
       title: EmbeddingsResponse
       description: >-
         Response containing generated embeddings.
-    AgentCandidate:
-      type: object
-      properties:
-        type:
-          type: string
-          const: agent
-          default: agent
-        config:
-          $ref: '#/components/schemas/AgentConfig'
-          description: >-
-            The configuration for the agent candidate.
-      additionalProperties: false
-      required:
-        - type
-        - config
-      title: AgentCandidate
-      description: An agent candidate for evaluation.
-    AggregationFunctionType:
-      type: string
-      enum:
-        - average
-        - median
-        - categorical_count
-        - accuracy
-      title: AggregationFunctionType
-    BasicScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: basic
-          default: basic
-        aggregation_functions:
-          type: array
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-      additionalProperties: false
-      required:
-        - type
-      title: BasicScoringFnParams
-    BenchmarkConfig:
-      type: object
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/EvalCandidate'
-          description: The candidate to evaluate.
-        scoring_params:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringFnParams'
-          description: >-
-            Map between scoring function id and parameters for each scoring function
-            you want to run
-        num_examples:
-          type: integer
-          description: >-
-            (Optional) The number of examples to evaluate. If not provided, all examples
-            in the dataset will be evaluated
-      additionalProperties: false
-      required:
-        - eval_candidate
-        - scoring_params
-      title: BenchmarkConfig
-      description: >-
-        A benchmark configuration for evaluation.
-    EvalCandidate:
-      oneOf:
-        - $ref: '#/components/schemas/ModelCandidate'
-        - $ref: '#/components/schemas/AgentCandidate'
-      discriminator:
-        propertyName: type
-        mapping:
-          model: '#/components/schemas/ModelCandidate'
-          agent: '#/components/schemas/AgentCandidate'
-    LLMAsJudgeScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: llm_as_judge
-          default: llm_as_judge
-        judge_model:
-          type: string
-        prompt_template:
-          type: string
-        judge_score_regexes:
-          type: array
-          items:
-            type: string
-        aggregation_functions:
-          type: array
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-      additionalProperties: false
-      required:
-        - type
-        - judge_model
-      title: LLMAsJudgeScoringFnParams
-    ModelCandidate:
-      type: object
-      properties:
-        type:
-          type: string
-          const: model
-          default: model
-        model:
-          type: string
-          description: The model ID to evaluate.
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model.
-        system_message:
-          $ref: '#/components/schemas/SystemMessage'
-          description: >-
-            (Optional) The system message providing instructions or context to the
-            model.
-      additionalProperties: false
-      required:
-        - type
-        - model
-        - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
-    RegexParserScoringFnParams:
-      type: object
-      properties:
-        type:
-          type: string
-          const: regex_parser
-          default: regex_parser
-        parsing_regexes:
-          type: array
-          items:
-            type: string
-        aggregation_functions:
-          type: array
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-      additionalProperties: false
-      required:
-        - type
-      title: RegexParserScoringFnParams
-    ScoringFnParams:
-      oneOf:
-        - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-        - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        - $ref: '#/components/schemas/BasicScoringFnParams'
-      discriminator:
-        propertyName: type
-        mapping:
-          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-          basic: '#/components/schemas/BasicScoringFnParams'
-    EvaluateRowsRequest:
-      type: object
-      properties:
-        input_rows:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: The rows to evaluate.
-        scoring_functions:
-          type: array
-          items:
-            type: string
-          description: >-
-            The scoring functions to use for the evaluation.
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark.
-      additionalProperties: false
-      required:
-        - input_rows
-        - scoring_functions
-        - benchmark_config
-      title: EvaluateRowsRequest
-    EvaluateResponse:
-      type: object
-      properties:
-        generations:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: The generations from the evaluation.
-        scores:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          description: The scores from the evaluation.
-      additionalProperties: false
-      required:
-        - generations
-        - scores
-      title: EvaluateResponse
-      description: The response from an evaluation.
-    ScoringResult:
-      type: object
-      properties:
-        score_rows:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: >-
-            The scoring result for each row. Each row is a map of column name to value.
-        aggregated_results:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: Map of metric name to aggregated value
-      additionalProperties: false
-      required:
-        - score_rows
-        - aggregated_results
-      title: ScoringResult
-      description: A scoring result for a single row.
     Agent:
       type: object
       properties:
@@ -4703,10 +4396,14 @@ components:
           default: benchmark
         dataset_id:
           type: string
-        scoring_functions:
+          description: >-
+            The ID of the dataset used to run the benchmark.
+        grader_ids:
           type: array
           items:
             type: string
+          description: >-
+            The grader ids to use for this benchmark.
         metadata:
           type: object
           additionalProperties:
@@ -4717,6 +4414,8 @@ components:
               - type: string
               - type: array
               - type: object
+          description: >-
+            Metadata for this benchmark for additional descriptions.
       additionalProperties: false
       required:
         - identifier
@@ -4724,7 +4423,7 @@ components:
         - provider_id
         - type
         - dataset_id
-        - scoring_functions
+        - grader_ids
         - metadata
       title: Benchmark
     DataSource:
@@ -4828,6 +4527,255 @@ components:
       title: URIDataSource
       description: >-
         A dataset that can be obtained from a URI.
+    EqualityGrader:
+      type: object
+      properties:
+        type:
+          type: string
+          const: equality
+          default: equality
+        equality:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+          additionalProperties: false
+          required:
+            - aggregation_functions
+          title: BasicGraderParams
+      additionalProperties: false
+      required:
+        - type
+        - equality
+      title: EqualityGrader
+    FactualityGrader:
+      type: object
+      properties:
+        type:
+          type: string
+          const: factuality
+          default: factuality
+        factuality:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+          additionalProperties: false
+          required:
+            - aggregation_functions
+          title: BasicGraderParams
+      additionalProperties: false
+      required:
+        - type
+        - factuality
+      title: FactualityGrader
+    FaithfulnessGrader:
+      type: object
+      properties:
+        type:
+          type: string
+          const: faithfulness
+          default: faithfulness
+        faithfulness:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+          additionalProperties: false
+          required:
+            - aggregation_functions
+          title: BasicGraderParams
+      additionalProperties: false
+      required:
+        - type
+        - faithfulness
+      title: FaithfulnessGrader
+    Grader:
+      type: object
+      properties:
+        identifier:
+          type: string
+        provider_resource_id:
+          type: string
+        provider_id:
+          type: string
+        type:
+          type: string
+          const: grader
+          default: grader
+        grader:
+          $ref: '#/components/schemas/GraderDefinition'
+        description:
+          type: string
+        metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+      additionalProperties: false
+      required:
+        - identifier
+        - provider_resource_id
+        - provider_id
+        - type
+        - grader
+        - metadata
+      title: Grader
+    GraderDefinition:
+      oneOf:
+        - $ref: '#/components/schemas/LlmGrader'
+        - $ref: '#/components/schemas/RegexParserGrader'
+        - $ref: '#/components/schemas/EqualityGrader'
+        - $ref: '#/components/schemas/SubsetOfGrader'
+        - $ref: '#/components/schemas/FactualityGrader'
+        - $ref: '#/components/schemas/FaithfulnessGrader'
+      discriminator:
+        propertyName: type
+        mapping:
+          llm: '#/components/schemas/LlmGrader'
+          regex_parser: '#/components/schemas/RegexParserGrader'
+          equality: '#/components/schemas/EqualityGrader'
+          subset_of: '#/components/schemas/SubsetOfGrader'
+          factuality: '#/components/schemas/FactualityGrader'
+          faithfulness: '#/components/schemas/FaithfulnessGrader'
+    LlmGrader:
+      type: object
+      properties:
+        type:
+          type: string
+          const: llm
+          default: llm
+        llm:
+          type: object
+          properties:
+            model:
+              type: string
+            prompt:
+              type: string
+            score_regexes:
+              type: array
+              items:
+                type: string
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+          additionalProperties: false
+          required:
+            - model
+            - prompt
+            - score_regexes
+            - aggregation_functions
+          title: LlmGraderParams
+      additionalProperties: false
+      required:
+        - type
+        - llm
+      title: LlmGrader
+    RegexParserGrader:
+      type: object
+      properties:
+        type:
+          type: string
+          const: regex_parser
+          default: regex_parser
+        regex_parser:
+          type: object
+          properties:
+            parsing_regexes:
+              type: array
+              items:
+                type: string
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+          additionalProperties: false
+          required:
+            - parsing_regexes
+            - aggregation_functions
+          title: RegexParserGraderParams
+      additionalProperties: false
+      required:
+        - type
+        - regex_parser
+      title: RegexParserGrader
+    SubsetOfGrader:
+      type: object
+      properties:
+        type:
+          type: string
+          const: subset_of
+          default: subset_of
+        subset_of:
+          type: object
+          properties:
+            aggregation_functions:
+              type: array
+              items:
+                type: string
+                enum:
+                  - average
+                  - median
+                  - categorical_count
+                  - accuracy
+                title: AggregationFunctionType
+                description: A type of aggregation function.
+          additionalProperties: false
+          required:
+            - aggregation_functions
+          title: BasicGraderParams
+      additionalProperties: false
+      required:
+        - type
+        - subset_of
+      title: SubsetOfGrader
     Model:
       type: object
       properties:
@@ -4869,179 +4817,6 @@ components:
         - llm
         - embedding
       title: ModelType
-    AgentTurnInputType:
-      type: object
-      properties:
-        type:
-          type: string
-          const: agent_turn_input
-          default: agent_turn_input
-      additionalProperties: false
-      required:
-        - type
-      title: AgentTurnInputType
-    ArrayType:
-      type: object
-      properties:
-        type:
-          type: string
-          const: array
-          default: array
-      additionalProperties: false
-      required:
-        - type
-      title: ArrayType
-    BooleanType:
-      type: object
-      properties:
-        type:
-          type: string
-          const: boolean
-          default: boolean
-      additionalProperties: false
-      required:
-        - type
-      title: BooleanType
-    ChatCompletionInputType:
-      type: object
-      properties:
-        type:
-          type: string
-          const: chat_completion_input
-          default: chat_completion_input
-      additionalProperties: false
-      required:
-        - type
-      title: ChatCompletionInputType
-    CompletionInputType:
-      type: object
-      properties:
-        type:
-          type: string
-          const: completion_input
-          default: completion_input
-      additionalProperties: false
-      required:
-        - type
-      title: CompletionInputType
-    JsonType:
-      type: object
-      properties:
-        type:
-          type: string
-          const: json
-          default: json
-      additionalProperties: false
-      required:
-        - type
-      title: JsonType
-    NumberType:
-      type: object
-      properties:
-        type:
-          type: string
-          const: number
-          default: number
-      additionalProperties: false
-      required:
-        - type
-      title: NumberType
-    ObjectType:
-      type: object
-      properties:
-        type:
-          type: string
-          const: object
-          default: object
-      additionalProperties: false
-      required:
-        - type
-      title: ObjectType
-    ParamType:
-      oneOf:
-        - $ref: '#/components/schemas/StringType'
-        - $ref: '#/components/schemas/NumberType'
-        - $ref: '#/components/schemas/BooleanType'
-        - $ref: '#/components/schemas/ArrayType'
-        - $ref: '#/components/schemas/ObjectType'
-        - $ref: '#/components/schemas/JsonType'
-        - $ref: '#/components/schemas/UnionType'
-        - $ref: '#/components/schemas/ChatCompletionInputType'
-        - $ref: '#/components/schemas/CompletionInputType'
-        - $ref: '#/components/schemas/AgentTurnInputType'
-      discriminator:
-        propertyName: type
-        mapping:
-          string: '#/components/schemas/StringType'
-          number: '#/components/schemas/NumberType'
-          boolean: '#/components/schemas/BooleanType'
-          array: '#/components/schemas/ArrayType'
-          object: '#/components/schemas/ObjectType'
-          json: '#/components/schemas/JsonType'
-          union: '#/components/schemas/UnionType'
-          chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-          completion_input: '#/components/schemas/CompletionInputType'
-          agent_turn_input: '#/components/schemas/AgentTurnInputType'
-    ScoringFn:
-      type: object
-      properties:
-        identifier:
-          type: string
-        provider_resource_id:
-          type: string
-        provider_id:
-          type: string
-        type:
-          type: string
-          const: scoring_function
-          default: scoring_function
-        description:
-          type: string
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-        return_type:
-          $ref: '#/components/schemas/ParamType'
-        params:
-          $ref: '#/components/schemas/ScoringFnParams'
-      additionalProperties: false
-      required:
-        - identifier
-        - provider_resource_id
-        - provider_id
-        - type
-        - metadata
-        - return_type
-      title: ScoringFn
-    StringType:
-      type: object
-      properties:
-        type:
-          type: string
-          const: string
-          default: string
-      additionalProperties: false
-      required:
-        - type
-      title: StringType
-    UnionType:
-      type: object
-      properties:
-        type:
-          type: string
-          const: union
-          default: union
-      additionalProperties: false
-      required:
-        - type
-      title: UnionType
     Shield:
       type: object
       properties:
@@ -5292,21 +5067,20 @@ components:
         - checkpoints
       title: PostTrainingJobArtifactsResponse
       description: Artifacts of a finetuning job.
-    JobStatus:
-      type: string
-      enum:
-        - completed
-        - in_progress
-        - failed
-        - scheduled
-      title: JobStatus
     PostTrainingJobStatusResponse:
       type: object
       properties:
         job_uuid:
           type: string
         status:
-          $ref: '#/components/schemas/JobStatus'
+          type: string
+          enum:
+            - completed
+            - in_progress
+            - failed
+            - scheduled
+            - cancelled
+          title: JobStatus
         scheduled_at:
           type: string
           format: date-time
@@ -5381,6 +5155,255 @@ components:
         - embedding_model
         - embedding_dimension
       title: VectorDB
+    BenchmarkTask:
+      type: object
+      properties:
+        type:
+          type: string
+          const: benchmark_id
+          default: benchmark_id
+        benchmark_id:
+          type: string
+      additionalProperties: false
+      required:
+        - type
+        - benchmark_id
+      title: BenchmarkTask
+    DataSourceGraderTask:
+      type: object
+      properties:
+        type:
+          type: string
+          const: data_source_grader
+          default: data_source_grader
+        data_source:
+          $ref: '#/components/schemas/DataSource'
+        grader_ids:
+          type: array
+          items:
+            type: string
+      additionalProperties: false
+      required:
+        - type
+        - data_source
+        - grader_ids
+      title: DataSourceGraderTask
+    DatasetGraderTask:
+      type: object
+      properties:
+        type:
+          type: string
+          const: dataset_grader
+          default: dataset_grader
+        dataset_id:
+          type: string
+        grader_ids:
+          type: array
+          items:
+            type: string
+      additionalProperties: false
+      required:
+        - type
+        - dataset_id
+        - grader_ids
+      title: DatasetGraderTask
+    EvaluationTask:
+      oneOf:
+        - $ref: '#/components/schemas/BenchmarkTask'
+        - $ref: '#/components/schemas/DatasetGraderTask'
+        - $ref: '#/components/schemas/DataSourceGraderTask'
+      discriminator:
+        propertyName: type
+        mapping:
+          benchmark_id: '#/components/schemas/BenchmarkTask'
+          dataset_grader: '#/components/schemas/DatasetGraderTask'
+          data_source_grader: '#/components/schemas/DataSourceGraderTask'
+    GradeRequest:
+      type: object
+      properties:
+        task:
+          $ref: '#/components/schemas/EvaluationTask'
+          description: >-
+            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
+            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
+            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
+            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+      additionalProperties: false
+      required:
+        - task
+      title: GradeRequest
+    AgentCandidate:
+      type: object
+      properties:
+        type:
+          type: string
+          const: agent
+          default: agent
+        config:
+          $ref: '#/components/schemas/AgentConfig'
+          description: >-
+            The configuration for the agent candidate.
+      additionalProperties: false
+      required:
+        - type
+        - config
+      title: AgentCandidate
+      description: An agent candidate for evaluation.
+    EvaluationCandidate:
+      oneOf:
+        - $ref: '#/components/schemas/ModelCandidate'
+        - $ref: '#/components/schemas/AgentCandidate'
+      discriminator:
+        propertyName: type
+        mapping:
+          model: '#/components/schemas/ModelCandidate'
+          agent: '#/components/schemas/AgentCandidate'
+    EvaluationJob:
+      type: object
+      properties:
+        id:
+          type: string
+          description: The ID of the job.
+        status:
+          type: string
+          enum:
+            - completed
+            - in_progress
+            - failed
+            - scheduled
+            - cancelled
+          description: The status of the job.
+        created_at:
+          type: string
+          format: date-time
+          description: The time the job was created.
+        ended_at:
+          type: string
+          format: date-time
+          description: The time the job ended.
+        error:
+          type: string
+          description: >-
+            If status of the job is failed, this will contain the error message.
+        type:
+          type: string
+          const: evaluation
+          default: evaluation
+        task:
+          $ref: '#/components/schemas/EvaluationTask'
+        candidate:
+          $ref: '#/components/schemas/EvaluationCandidate'
+      additionalProperties: false
+      required:
+        - id
+        - status
+        - created_at
+        - type
+        - task
+        - candidate
+      title: EvaluationJob
+    ModelCandidate:
+      type: object
+      properties:
+        type:
+          type: string
+          const: model
+          default: model
+        model_id:
+          type: string
+        sampling_params:
+          $ref: '#/components/schemas/SamplingParams'
+          description: The sampling parameters for the model.
+        system_message:
+          $ref: '#/components/schemas/SystemMessage'
+          description: >-
+            (Optional) The system message providing instructions or context to the
+            model.
+      additionalProperties: false
+      required:
+        - type
+        - model_id
+        - sampling_params
+      title: ModelCandidate
+      description: A model candidate for evaluation.
+    GradeInlineRequest:
+      type: object
+      properties:
+        task:
+          $ref: '#/components/schemas/EvaluationTask'
+          description: >-
+            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
+            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
+            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
+            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+      additionalProperties: false
+      required:
+        - task
+      title: GradeInlineRequest
+    EvaluationResponse:
+      type: object
+      properties:
+        generations:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+          description: >-
+            The generations in rows for the evaluation.
+        scores:
+          type: object
+          additionalProperties:
+            $ref: '#/components/schemas/ScoringResult'
+          description: >-
+            The scores for the evaluation. Map of grader id to ScoringResult.
+      additionalProperties: false
+      required:
+        - generations
+        - scores
+      title: EvaluationResponse
+      description: A response to an inline evaluation.
+    ScoringResult:
+      type: object
+      properties:
+        scores:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+          description: >-
+            The scoring result for each row. Each row is a map of grader column name
+            to value.
+        metrics:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+          description: Map of metric name to aggregated value.
+      additionalProperties: false
+      required:
+        - scores
+        - metrics
+      title: ScoringResult
+      description: A scoring result with per-row scores and aggregated metrics.
     HealthInfo:
       type: object
       properties:
@@ -5648,6 +5671,56 @@ components:
       title: ListFileResponse
       description: >-
         Response representing a list of file entries.
+    GraderTypeInfo:
+      type: object
+      properties:
+        grader_type:
+          type: string
+          enum:
+            - llm
+            - regex_parser
+            - equality
+            - subset_of
+            - factuality
+            - faithfulness
+          title: GraderType
+          description: >-
+            A type of grader. Each type is a criterion for evaluating answers.
+        description:
+          type: string
+          description: >-
+            A description of the grader type, e.g. "Write your custom judge prompt
+            to score the answer."
+        supported_dataset_purposes:
+          type: array
+          items:
+            type: string
+            enum:
+              - post-training/messages
+              - eval/question-answer
+              - eval/messages-answer
+            title: DatasetPurpose
+            description: >-
+              Purpose of the dataset. Each purpose has a required input data schema.
+          description: >-
+            The purposes that this grader can be used for.
+      additionalProperties: false
+      required:
+        - grader_type
+        - description
+        - supported_dataset_purposes
+      title: GraderTypeInfo
+    ListGraderTypesResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/GraderTypeInfo'
+      additionalProperties: false
+      required:
+        - data
+      title: ListGraderTypesResponse
     ListModelsResponse:
       type: object
       properties:
@@ -5698,17 +5771,6 @@ components:
       required:
         - data
       title: ListRoutesResponse
-    ListScoringFunctionsResponse:
-      type: object
-      properties:
-        data:
-          type: array
-          items:
-            $ref: '#/components/schemas/ScoringFn'
-      additionalProperties: false
-      required:
-        - data
-      title: ListScoringFunctionsResponse
     ListShieldsResponse:
       type: object
       properties:
@@ -6343,18 +6405,21 @@ components:
     RegisterBenchmarkRequest:
       type: object
       properties:
-        benchmark_id:
-          type: string
         dataset_id:
           type: string
-        scoring_functions:
+          description: >-
+            The ID of the dataset to use to run the benchmark.
+        grader_ids:
           type: array
           items:
             type: string
-        provider_benchmark_id:
-          type: string
-        provider_id:
+          description: >-
+            List of grader ids to use for this benchmark.
+        benchmark_id:
           type: string
+          description: >-
+            (Optional) The ID of the benchmark to register. If not provided, an ID
+            will be generated.
         metadata:
           type: object
           additionalProperties:
@@ -6365,11 +6430,12 @@ components:
               - type: string
               - type: array
               - type: object
+          description: >-
+            (Optional) Metadata for this benchmark for additional descriptions.
       additionalProperties: false
       required:
-        - benchmark_id
         - dataset_id
-        - scoring_functions
+        - grader_ids
       title: RegisterBenchmarkRequest
     RegisterDatasetRequest:
       type: object
@@ -6422,6 +6488,37 @@ components:
         - purpose
         - source
       title: RegisterDatasetRequest
+    RegisterGraderRequest:
+      type: object
+      properties:
+        grader:
+          $ref: '#/components/schemas/GraderDefinition'
+          description: >-
+            The grader definition, E.g. - { "type": "llm", "llm": { "model": "llama-405b",
+            "prompt": "You are a judge. Score the answer based on the question. {question}
+            {answer}", } }
+        grader_id:
+          type: string
+          description: >-
+            (Optional) The ID of the grader. If not provided, a random ID will be
+            generated.
+        metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+          description: >-
+            (Optional) Any additional metadata for this grader. - E.g. { "description":
+            "A grader that scores the answer based on the question.", }
+      additionalProperties: false
+      required:
+        - grader
+      title: RegisterGraderRequest
     RegisterModelRequest:
       type: object
       properties:
@@ -6447,27 +6544,6 @@ components:
       required:
         - model_id
       title: RegisterModelRequest
-    RegisterScoringFunctionRequest:
-      type: object
-      properties:
-        scoring_fn_id:
-          type: string
-        description:
-          type: string
-        return_type:
-          $ref: '#/components/schemas/ParamType'
-        provider_scoring_fn_id:
-          type: string
-        provider_id:
-          type: string
-        params:
-          $ref: '#/components/schemas/ScoringFnParams'
-      additionalProperties: false
-      required:
-        - scoring_fn_id
-        - description
-        - return_type
-      title: RegisterScoringFunctionRequest
     RegisterShieldRequest:
       type: object
       properties:
@@ -6549,25 +6625,42 @@ components:
       required:
         - tool_responses
       title: ResumeAgentTurnRequest
-    RunEvalRequest:
+    RunRequest:
       type: object
       properties:
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark.
+        task:
+          $ref: '#/components/schemas/EvaluationTask'
+          description: >-
+            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
+            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
+            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
+            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+        candidate:
+          $ref: '#/components/schemas/EvaluationCandidate'
+          description: The candidate to evaluate.
       additionalProperties: false
       required:
-        - benchmark_config
-      title: RunEvalRequest
-    Job:
+        - task
+        - candidate
+      title: RunRequest
+    RunInlineRequest:
       type: object
       properties:
-        job_id:
-          type: string
+        task:
+          $ref: '#/components/schemas/EvaluationTask'
+          description: >-
+            The task to evaluate. One of: - BenchmarkTask: Run evaluation task against
+            a benchmark_id - DatasetGraderTask: Run evaluation task against a dataset_id
+            and a list of grader_ids - DataSourceGraderTask: Run evaluation task against
+            a data source (e.g. rows, uri, etc.) and a list of grader_ids
+        candidate:
+          $ref: '#/components/schemas/EvaluationCandidate'
+          description: The candidate to evaluate.
       additionalProperties: false
       required:
-        - job_id
-      title: Job
+        - task
+        - candidate
+      title: RunInlineRequest
     RunShieldRequest:
       type: object
       properties:
@@ -6621,81 +6714,6 @@ components:
         - attributes_to_save
         - dataset_id
       title: SaveSpansToDatasetRequest
-    ScoreRequest:
-      type: object
-      properties:
-        input_rows:
-          type: array
-          items:
-            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
-          description: The rows to score.
-        scoring_functions:
-          type: object
-          additionalProperties:
-            oneOf:
-              - $ref: '#/components/schemas/ScoringFnParams'
-              - type: 'null'
-          description: >-
-            The scoring functions to use for the scoring.
-      additionalProperties: false
-      required:
-        - input_rows
-        - scoring_functions
-      title: ScoreRequest
-    ScoreResponse:
-      type: object
-      properties:
-        results:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          description: >-
-            A map of scoring function name to ScoringResult.
-      additionalProperties: false
-      required:
-        - results
-      title: ScoreResponse
-      description: The response from scoring.
-    ScoreBatchRequest:
-      type: object
-      properties:
-        dataset_id:
-          type: string
-        scoring_functions:
-          type: object
-          additionalProperties:
-            oneOf:
-              - $ref: '#/components/schemas/ScoringFnParams'
-              - type: 'null'
-        save_results_dataset:
-          type: boolean
-      additionalProperties: false
-      required:
-        - dataset_id
-        - scoring_functions
-        - save_results_dataset
-      title: ScoreBatchRequest
-    ScoreBatchResponse:
-      type: object
-      properties:
-        dataset_id:
-          type: string
-        results:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-      additionalProperties: false
-      required:
-        - results
-      title: ScoreBatchResponse
     AlgorithmConfig:
       oneOf:
         - $ref: '#/components/schemas/LoraFinetuningConfig'
@@ -6933,10 +6951,9 @@ tags:
   - name: Benchmarks
   - name: DatasetIO
   - name: Datasets
-  - name: Eval
-    x-displayName: >-
-      Llama Stack Evaluation API for running evaluations on model and agent candidates.
+  - name: Evaluation
   - name: Files
+  - name: Graders
   - name: Inference
     description: >-
       This API provides the raw interface to the underlying models. Two kinds of models
@@ -6956,8 +6973,6 @@ tags:
     x-displayName: >-
       Providers API for inspecting, listing, and modifying providers and their configurations.
   - name: Safety
-  - name: Scoring
-  - name: ScoringFunctions
   - name: Shields
   - name: SyntheticDataGeneration (Coming Soon)
   - name: Telemetry
@@ -6973,16 +6988,15 @@ x-tagGroups:
       - Benchmarks
       - DatasetIO
       - Datasets
-      - Eval
+      - Evaluation
       - Files
+      - Graders
       - Inference
       - Inspect
       - Models
       - PostTraining (Coming Soon)
       - Providers
       - Safety
-      - Scoring
-      - ScoringFunctions
       - Shields
       - SyntheticDataGeneration (Coming Soon)
       - Telemetry
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 39ba355e9..eaaf8530b 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -12,11 +12,17 @@ from llama_stack.schema_utils import json_schema_type, webmethod
 
 
 class CommonBenchmarkFields(BaseModel):
+    """
+    :param dataset_id: The ID of the dataset to use to run the benchmark.
+    :param grader_ids: The grader ids to use for this benchmark.
+    :param metadata: Metadata for this benchmark for additional descriptions.
+    """
+
     dataset_id: str
-    scoring_functions: List[str]
+    grader_ids: List[str]
     metadata: Dict[str, Any] = Field(
         default_factory=dict,
-        description="Metadata for this evaluation task",
+        description="Metadata for this benchmark",
     )
 
 
@@ -45,22 +51,39 @@ class ListBenchmarksResponse(BaseModel):
 
 @runtime_checkable
 class Benchmarks(Protocol):
+    @webmethod(route="/eval/benchmarks", method="POST")
+    async def register_benchmark(
+        self,
+        dataset_id: str,
+        grader_ids: List[str],
+        benchmark_id: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> Benchmark:
+        """
+        Register a new benchmark.
+
+        :param dataset_id: The ID of the dataset to use to run the benchmark.
+        :param grader_ids: List of grader ids to use for this benchmark.
+        :param benchmark_id: (Optional) The ID of the benchmark to register. If not provided, an ID will be generated.
+        :param metadata: (Optional) Metadata for this benchmark for additional descriptions.
+        """
+        ...
+
     @webmethod(route="/eval/benchmarks", method="GET")
-    async def list_benchmarks(self) -> ListBenchmarksResponse: ...
+    async def list_benchmarks(self) -> ListBenchmarksResponse:
+        """
+        List all benchmarks.
+        """
+        ...
 
     @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
     async def get_benchmark(
         self,
         benchmark_id: str,
-    ) -> Optional[Benchmark]: ...
+    ) -> Benchmark:
+        """
+        Get a benchmark by ID.
 
-    @webmethod(route="/eval/benchmarks", method="POST")
-    async def register_benchmark(
-        self,
-        benchmark_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...
+        :param benchmark_id: The ID of the benchmark to get.
+        """
+        ...
diff --git a/llama_stack/apis/common/job_types.py b/llama_stack/apis/common/job_types.py
index bc070017b..e27f19493 100644
--- a/llama_stack/apis/common/job_types.py
+++ b/llama_stack/apis/common/job_types.py
@@ -3,21 +3,49 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from datetime import datetime
 from enum import Enum
+from typing import Optional
 
 from pydantic import BaseModel
 
 from llama_stack.schema_utils import json_schema_type
 
 
-@json_schema_type
-class Job(BaseModel):
-    job_id: str
+class JobType(Enum):
+    batch_inference = "batch_inference"
+    evaluation = "evaluation"
+    finetuning = "finetuning"
 
 
-@json_schema_type
 class JobStatus(Enum):
     completed = "completed"
     in_progress = "in_progress"
     failed = "failed"
     scheduled = "scheduled"
+    cancelled = "cancelled"
+
+
+class JobArtifact(BaseModel):
+    """
+    A job artifact is a file or directory that is produced by a job.
+    """
+
+    path: str
+
+
+@json_schema_type
+class CommonJobFields(BaseModel):
+    """Common fields for all jobs.
+    :param id: The ID of the job.
+    :param status: The status of the job.
+    :param created_at: The time the job was created.
+    :param ended_at: The time the job ended.
+    :param error: If status of the job is failed, this will contain the error message.
+    """
+
+    id: str
+    status: JobStatus
+    created_at: datetime
+    ended_at: Optional[datetime] = None
+    error: Optional[str] = None
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index dec018d83..5b4433041 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -10,7 +10,7 @@ from pydantic import BaseModel, Field
 from typing_extensions import Annotated
 
 from llama_stack.apis.agents import AgentConfig
-from llama_stack.apis.common.job_types import Job, JobStatus
+from llama_stack.apis.common.job_types import JobStatus
 from llama_stack.apis.inference import SamplingParams, SystemMessage
 from llama_stack.apis.scoring import ScoringResult
 from llama_stack.apis.scoring_functions import ScoringFnParams
@@ -91,7 +91,7 @@ class Eval(Protocol):
         self,
         benchmark_id: str,
         benchmark_config: BenchmarkConfig,
-    ) -> Job:
+    ) -> None:
         """Run an evaluation on a benchmark.
 
         :param benchmark_id: The ID of the benchmark to run the evaluation on.
@@ -135,7 +135,9 @@ class Eval(Protocol):
         """
         ...
 
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
+    @webmethod(
+        route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET"
+    )
     async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
         """Get the result of a job.
 
diff --git a/llama_stack/apis/evaluation/__init__.py b/llama_stack/apis/evaluation/__init__.py
new file mode 100644
index 000000000..9a168a2bc
--- /dev/null
+++ b/llama_stack/apis/evaluation/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .evaluation import *  # noqa: F401 F403
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
new file mode 100644
index 000000000..444495b6e
--- /dev/null
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -0,0 +1,175 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any, Dict, List, Literal, Optional, Protocol, Union
+
+from pydantic import BaseModel, Field
+from typing_extensions import Annotated
+
+from llama_stack.apis.agents import AgentConfig
+from llama_stack.apis.common.job_types import CommonJobFields, JobType
+from llama_stack.apis.datasets import DataSource
+from llama_stack.apis.inference import SamplingParams, SystemMessage
+from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
+
+
+@json_schema_type
+class ModelCandidate(BaseModel):
+    """A model candidate for evaluation.
+
+    :param model_id: The model ID to evaluate.
+    :param sampling_params: The sampling parameters for the model.
+    :param system_message: (Optional) The system message providing instructions or context to the model.
+    """
+
+    type: Literal["model"] = "model"
+    model_id: str
+    sampling_params: SamplingParams
+    system_message: Optional[SystemMessage] = None
+
+
+@json_schema_type
+class AgentCandidate(BaseModel):
+    """An agent candidate for evaluation.
+
+    :param config: The configuration for the agent candidate.
+    """
+
+    type: Literal["agent"] = "agent"
+    config: AgentConfig
+
+
+EvaluationCandidate = register_schema(
+    Annotated[Union[ModelCandidate, AgentCandidate], Field(discriminator="type")],
+    name="EvaluationCandidate",
+)
+
+
+@json_schema_type
+class BenchmarkTask(BaseModel):
+    type: Literal["benchmark_id"] = "benchmark_id"
+    benchmark_id: str
+
+
+@json_schema_type
+class DatasetGraderTask(BaseModel):
+    type: Literal["dataset_grader"] = "dataset_grader"
+    dataset_id: str
+    grader_ids: List[str]
+
+
+@json_schema_type
+class DataSourceGraderTask(BaseModel):
+    type: Literal["data_source_grader"] = "data_source_grader"
+    data_source: DataSource
+    grader_ids: List[str]
+
+
+EvaluationTask = register_schema(
+    Annotated[
+        Union[BenchmarkTask, DatasetGraderTask, DataSourceGraderTask],
+        Field(discriminator="type"),
+    ],
+    name="EvaluationTask",
+)
+
+
+@json_schema_type
+class EvaluationJob(CommonJobFields):
+    type: Literal[JobType.evaluation.value] = JobType.evaluation.value
+
+    # input params for the submitted evaluation job
+    task: EvaluationTask
+    candidate: EvaluationCandidate
+
+
+@json_schema_type
+class ScoringResult(BaseModel):
+    """
+    A scoring result for a single grader.
+
+    :param scores: The scoring result for each row. Each row is a map of grader column name to value.
+    :param metrics: Map of metric name to aggregated value.
+    """
+
+    scores: List[Dict[str, Any]]
+    metrics: Dict[str, Any]
+
+
+@json_schema_type
+class EvaluationResponse(BaseModel):
+    """
+    A response to an inline evaluation.
+
+    :param generations: The generations in rows for the evaluation.
+    :param scores: The scores for the evaluation. Map of grader id to ScoringResult.
+    """
+
+    generations: List[Dict[str, Any]]
+    scores: Dict[str, ScoringResult]
+
+
+class Evaluation(Protocol):
+    @webmethod(route="/evaluation/run", method="POST")
+    async def run(
+        self,
+        task: EvaluationTask,
+        candidate: EvaluationCandidate,
+    ) -> EvaluationJob:
+        """
+        Run an evaluation job.
+
+        :param task: The task to evaluate. One of:
+         - BenchmarkTask: Run evaluation task against a benchmark_id
+         - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
+         - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+        :param candidate: The candidate to evaluate.
+        """
+        ...
+
+    @webmethod(route="/evaluation/run_inline", method="POST")
+    async def run_inline(
+        self,
+        task: EvaluationTask,
+        candidate: EvaluationCandidate,
+    ) -> EvaluationResponse:
+        """
+        Run an evaluation job inline.
+
+        :param task: The task to evaluate. One of:
+         - BenchmarkTask: Run evaluation task against a benchmark_id
+         - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
+         - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+        :param candidate: The candidate to evaluate.
+        """
+        ...
+
+    @webmethod(route="/evaluation/grade", method="POST")
+    async def grade(self, task: EvaluationTask) -> EvaluationJob:
+        """
+        Run a grading job with generated results. Use this when you have generated results from inference in a dataset.
+
+        :param task: The task to evaluate. One of:
+         - BenchmarkTask: Run evaluation task against a benchmark_id
+         - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
+         - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+
+        :return: The evaluation job containing grader scores.
+        """
+        ...
+
+    @webmethod(route="/evaluation/grade_inline", method="POST")
+    async def grade_inline(self, task: EvaluationTask) -> EvaluationResponse:
+        """
+        Run a grading job with generated results inline.
+
+        :param task: The task to evaluate. One of:
+         - BenchmarkTask: Run evaluation task against a benchmark_id
+         - DatasetGraderTask: Run evaluation task against a dataset_id and a list of grader_ids
+         - DataSourceGraderTask: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids
+
+        :return: The evaluation response containing grader scores. "generations" is not populated in the response.
+        """
+        ...
diff --git a/llama_stack/apis/graders/__init__.py b/llama_stack/apis/graders/__init__.py
new file mode 100644
index 000000000..b5791cb88
--- /dev/null
+++ b/llama_stack/apis/graders/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .graders import *  # noqa: F401 F403
diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py
index 9c9289a77..cd1c58348 100644
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@@ -17,16 +17,15 @@ from llama_stack.apis.batch_inference import BatchInference
 from llama_stack.apis.benchmarks import Benchmarks
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.eval import Eval
+from llama_stack.apis.evaluation import Evaluation
 from llama_stack.apis.files import Files
+from llama_stack.apis.graders import Graders
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.inspect import Inspect
 from llama_stack.apis.models import Models
 from llama_stack.apis.post_training import PostTraining
 from llama_stack.apis.providers import Providers
 from llama_stack.apis.safety import Safety
-from llama_stack.apis.scoring import Scoring
-from llama_stack.apis.scoring_functions import ScoringFunctions
 from llama_stack.apis.shields import Shields
 from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration
 from llama_stack.apis.telemetry import Telemetry
@@ -56,10 +55,7 @@ class LlamaStack(
     Telemetry,
     PostTraining,
     VectorIO,
-    Eval,
     Benchmarks,
-    Scoring,
-    ScoringFunctions,
     DatasetIO,
     Models,
     Shields,
@@ -68,6 +64,8 @@ class LlamaStack(
     ToolRuntime,
     RAGToolRuntime,
     Files,
+    Graders,
+    Evaluation,
 ):
     pass
 
@@ -113,7 +111,9 @@ class EnvVarError(Exception):
     def __init__(self, var_name: str, path: str = ""):
         self.var_name = var_name
         self.path = path
-        super().__init__(f"Environment variable '{var_name}' not set or empty{f' at {path}' if path else ''}")
+        super().__init__(
+            f"Environment variable '{var_name}' not set or empty{f' at {path}' if path else ''}"
+        )
 
 
 def redact_sensitive_fields(data: Dict[str, Any]) -> Dict[str, Any]:
@@ -204,7 +204,9 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]:
         if not key:
             raise ValueError(f"Empty key in environment variable pair: {env_pair}")
         if not all(c.isalnum() or c == "_" for c in key):
-            raise ValueError(f"Key must contain only alphanumeric characters and underscores: {key}")
+            raise ValueError(
+                f"Key must contain only alphanumeric characters and underscores: {key}"
+            )
         return key, value
     except ValueError as e:
         raise ValueError(
@@ -217,14 +219,20 @@ def validate_env_pair(env_pair: str) -> tuple[str, str]:
 async def construct_stack(
     run_config: StackRunConfig, provider_registry: Optional[ProviderRegistry] = None
 ) -> Dict[Api, Any]:
-    dist_registry, _ = await create_dist_registry(run_config.metadata_store, run_config.image_name)
-    impls = await resolve_impls(run_config, provider_registry or get_provider_registry(), dist_registry)
+    dist_registry, _ = await create_dist_registry(
+        run_config.metadata_store, run_config.image_name
+    )
+    impls = await resolve_impls(
+        run_config, provider_registry or get_provider_registry(), dist_registry
+    )
     await register_resources(run_config, impls)
     return impls
 
 
 def get_stack_run_config_from_template(template: str) -> StackRunConfig:
-    template_path = importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml"
+    template_path = (
+        importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml"
+    )
 
     with importlib.resources.as_file(template_path) as path:
         if not path.exists():
@@ -267,7 +275,9 @@ def run_config_from_adhoc_config_spec(
 
         # call method "sample_run_config" on the provider spec config class
         provider_config_type = instantiate_class_type(provider_spec.config_class)
-        provider_config = replace_env_vars(provider_config_type.sample_run_config(__distro_dir__=distro_dir))
+        provider_config = replace_env_vars(
+            provider_config_type.sample_run_config(__distro_dir__=distro_dir)
+        )
 
         provider_configs_by_api[api_str] = [
             Provider(