diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 98270f7b8..b93f6a380 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -40,6 +40,286 @@
}
],
"paths": {
+ "/v1/eval/tasks/{task_id}/evaluations": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluateResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/DeprecatedEvaluateRowsRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "deprecated": true
+ }
+ },
+ "/v1/eval-tasks/{task_id}": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/Benchmark"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": [
+ {
+          "name": "task_id",
+          "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "deprecated": true
+ }
+ },
+ "/v1/eval/tasks/{task_id}/jobs/{job_id}": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/JobStatus"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "job_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "deprecated": true
+ },
+ "delete": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "job_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "deprecated": true
+ }
+ },
+ "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/EvaluateResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ },
+ {
+ "name": "job_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "deprecated": true
+ }
+ },
+ "/v1/eval-tasks": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ListBenchmarksResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": [],
+ "deprecated": true
+ },
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "deprecated": true
+ }
+ },
+ "/v1/eval/tasks/{task_id}/jobs": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/Job"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Eval"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "task_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/DeprecatedRunEvalRequest"
+ }
+ }
+ },
+ "required": true
+ },
+ "deprecated": true
+ }
+ },
"/v1/datasetio/rows": {
"get": {
"responses": {
@@ -530,7 +810,7 @@
}
}
},
- "/v1/eval/tasks/{task_id}/evaluations": {
+ "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
"post": {
"responses": {
"200": {
@@ -550,7 +830,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -670,6 +950,43 @@
]
}
},
+ "/v1/eval/benchmarks/{benchmark_id}": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/Benchmark"
+ },
+ {
+ "type": "null"
+ }
+ ]
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": [
+ {
+ "name": "benchmark_id",
+ "in": "path",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ]
+ }
+ },
"/v1/datasets/{dataset_id}": {
"get": {
"responses": {
@@ -728,43 +1045,6 @@
]
}
},
- "/v1/eval-tasks/{eval_task_id}": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/EvalTask"
- },
- {
- "type": "null"
- }
- ]
- }
- }
- }
- }
- },
- "tags": [
- "EvalTasks"
- ],
- "description": "",
- "parameters": [
- {
- "name": "eval_task_id",
- "in": "path",
- "required": true,
- "schema": {
- "type": "string"
- }
- }
- ]
- }
- },
"/v1/models/{model_id}": {
"get": {
"responses": {
@@ -1348,7 +1628,7 @@
}
}
},
- "/v1/eval/tasks/{task_id}/jobs/{job_id}": {
+ "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
"get": {
"responses": {
"200": {
@@ -1375,7 +1655,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -1404,7 +1684,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -1422,7 +1702,7 @@
]
}
},
- "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
+ "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
"get": {
"responses": {
"200": {
@@ -1442,7 +1722,7 @@
"description": "",
"parameters": [
{
- "name": "job_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -1450,7 +1730,7 @@
}
},
{
- "name": "task_id",
+ "name": "job_id",
"in": "path",
"required": true,
"schema": {
@@ -1460,6 +1740,49 @@
]
}
},
+ "/v1/eval/benchmarks": {
+ "get": {
+ "responses": {
+ "200": {
+ "description": "OK",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ListBenchmarksResponse"
+ }
+ }
+ }
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": []
+ },
+ "post": {
+ "responses": {
+ "200": {
+ "description": "OK"
+ }
+ },
+ "tags": [
+ "Benchmarks"
+ ],
+ "description": "",
+ "parameters": [],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/RegisterBenchmarkRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/datasets": {
"get": {
"responses": {
@@ -1503,49 +1826,6 @@
}
}
},
- "/v1/eval-tasks": {
- "get": {
- "responses": {
- "200": {
- "description": "OK",
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/ListEvalTasksResponse"
- }
- }
- }
- }
- },
- "tags": [
- "EvalTasks"
- ],
- "description": "",
- "parameters": []
- },
- "post": {
- "responses": {
- "200": {
- "description": "OK"
- }
- },
- "tags": [
- "EvalTasks"
- ],
- "description": "",
- "parameters": [],
- "requestBody": {
- "content": {
- "application/json": {
- "schema": {
- "$ref": "#/components/schemas/RegisterEvalTaskRequest"
- }
- }
- },
- "required": true
- }
- }
- },
"/v1/models": {
"get": {
"responses": {
@@ -2121,7 +2401,7 @@
]
}
},
- "/v1/eval/tasks/{task_id}/jobs": {
+ "/v1/eval/benchmarks/{benchmark_id}/jobs": {
"post": {
"responses": {
"200": {
@@ -2141,7 +2421,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -2365,84 +2645,216 @@
"jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema",
"components": {
"schemas": {
- "AppendRowsRequest": {
+ "AgentCandidate": {
"type": "object",
"properties": {
- "dataset_id": {
- "type": "string"
+ "type": {
+ "type": "string",
+ "const": "agent",
+ "default": "agent"
},
- "rows": {
+ "config": {
+ "$ref": "#/components/schemas/AgentConfig"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "config"
+ ]
+ },
+ "AgentConfig": {
+ "type": "object",
+ "properties": {
+ "sampling_params": {
+ "$ref": "#/components/schemas/SamplingParams"
+ },
+ "input_shields": {
"type": "array",
"items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
+ "type": "string"
+ }
+ },
+ "output_shields": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "toolgroups": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AgentTool"
+ }
+ },
+ "client_tools": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ToolDef"
+ }
+ },
+ "tool_choice": {
+ "type": "string",
+ "enum": [
+ "auto",
+ "required"
+ ],
+ "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
+ },
+ "tool_prompt_format": {
+ "type": "string",
+ "enum": [
+ "json",
+ "function_tag",
+ "python_list"
+ ],
+ "description": "Prompt format for calling custom / zero shot tools."
+ },
+ "tool_config": {
+ "$ref": "#/components/schemas/ToolConfig"
+ },
+ "max_infer_iters": {
+ "type": "integer",
+ "default": 10
+ },
+ "model": {
+ "type": "string"
+ },
+ "instructions": {
+ "type": "string"
+ },
+ "enable_session_persistence": {
+ "type": "boolean"
+ },
+ "response_format": {
+ "$ref": "#/components/schemas/ResponseFormat"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "model",
+ "instructions",
+ "enable_session_persistence"
+ ]
+ },
+ "AgentTool": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "args": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
}
+ },
+ "additionalProperties": false,
+ "required": [
+ "name",
+ "args"
+ ]
+ }
+ ]
+ },
+ "AggregationFunctionType": {
+ "type": "string",
+ "enum": [
+ "average",
+ "median",
+ "categorical_count",
+ "accuracy"
+ ]
+ },
+ "BasicScoringFnParams": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "basic",
+ "default": "basic"
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AggregationFunctionType"
}
}
},
"additionalProperties": false,
"required": [
- "dataset_id",
- "rows"
+ "type"
]
},
- "CompletionMessage": {
+ "BenchmarkConfig": {
"type": "object",
"properties": {
- "role": {
+ "type": {
"type": "string",
- "const": "assistant",
- "default": "assistant",
- "description": "Must be \"assistant\" to identify this as the model's response"
+ "const": "benchmark",
+ "default": "benchmark"
},
- "content": {
- "$ref": "#/components/schemas/InterleavedContent",
- "description": "The content of the model's response"
+ "eval_candidate": {
+ "$ref": "#/components/schemas/EvalCandidate"
},
- "stop_reason": {
- "type": "string",
- "enum": [
- "end_of_turn",
- "end_of_message",
- "out_of_tokens"
- ],
- "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget."
+ "scoring_params": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ScoringFnParams"
+ }
},
- "tool_calls": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ToolCall"
- },
- "description": "List of tool calls. Each tool call is a ToolCall object."
+ "num_examples": {
+ "type": "integer"
}
},
"additionalProperties": false,
"required": [
- "role",
- "content",
- "stop_reason"
+ "type",
+ "eval_candidate",
+ "scoring_params"
+ ]
+ },
+ "EvalCandidate": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/ModelCandidate"
+ },
+ {
+ "$ref": "#/components/schemas/AgentCandidate"
+ }
],
- "description": "A message containing the model's (assistant) response in a chat conversation."
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "model": "#/components/schemas/ModelCandidate",
+ "agent": "#/components/schemas/AgentCandidate"
+ }
+ }
},
"GrammarResponseFormat": {
"type": "object",
@@ -2610,30 +3022,89 @@
],
"description": "Configuration for JSON schema-guided response generation."
},
- "Message": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/UserMessage"
+ "LLMAsJudgeScoringFnParams": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "llm_as_judge",
+ "default": "llm_as_judge"
},
- {
+ "judge_model": {
+ "type": "string"
+ },
+ "prompt_template": {
+ "type": "string"
+ },
+ "judge_score_regexes": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AggregationFunctionType"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "judge_model"
+ ]
+ },
+ "ModelCandidate": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "model",
+ "default": "model"
+ },
+ "model": {
+ "type": "string"
+ },
+ "sampling_params": {
+ "$ref": "#/components/schemas/SamplingParams"
+ },
+ "system_message": {
"$ref": "#/components/schemas/SystemMessage"
- },
- {
- "$ref": "#/components/schemas/ToolResponseMessage"
- },
- {
- "$ref": "#/components/schemas/CompletionMessage"
}
- ],
- "discriminator": {
- "propertyName": "role",
- "mapping": {
- "user": "#/components/schemas/UserMessage",
- "system": "#/components/schemas/SystemMessage",
- "tool": "#/components/schemas/ToolResponseMessage",
- "assistant": "#/components/schemas/CompletionMessage"
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "model",
+ "sampling_params"
+ ]
+ },
+ "RegexParserScoringFnParams": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "regex_parser",
+ "default": "regex_parser"
+ },
+ "parsing_regexes": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "aggregation_functions": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/AggregationFunctionType"
+ }
}
- }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ]
},
"ResponseFormat": {
"oneOf": [
@@ -2693,6 +3164,27 @@
}
}
},
+ "ScoringFnParams": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams"
+ },
+ {
+ "$ref": "#/components/schemas/RegexParserScoringFnParams"
+ },
+ {
+ "$ref": "#/components/schemas/BasicScoringFnParams"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams",
+ "regex_parser": "#/components/schemas/RegexParserScoringFnParams",
+ "basic": "#/components/schemas/BasicScoringFnParams"
+ }
+ }
+ },
"SystemMessage": {
"type": "object",
"properties": {
@@ -2735,6 +3227,611 @@
],
"description": "A text content item"
},
+ "ToolConfig": {
+ "type": "object",
+ "properties": {
+ "tool_choice": {
+ "type": "string",
+ "enum": [
+ "auto",
+ "required"
+ ],
+ "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.",
+ "default": "auto"
+ },
+ "tool_prompt_format": {
+ "type": "string",
+ "enum": [
+ "json",
+ "function_tag",
+ "python_list"
+ ],
+ "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
+ },
+ "system_message_behavior": {
+ "type": "string",
+ "enum": [
+ "append",
+ "replace"
+ ],
+ "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.",
+ "default": "append"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "system_message_behavior"
+ ],
+ "description": "Configuration for tool use."
+ },
+ "ToolDef": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "description": {
+ "type": "string"
+ },
+ "parameters": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ToolParameter"
+ }
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "name"
+ ]
+ },
+ "ToolParameter": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "parameter_type": {
+ "type": "string"
+ },
+ "description": {
+ "type": "string"
+ },
+ "required": {
+ "type": "boolean",
+ "default": true
+ },
+ "default": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "name",
+ "parameter_type",
+ "description",
+ "required"
+ ]
+ },
+ "TopKSamplingStrategy": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "top_k",
+ "default": "top_k"
+ },
+ "top_k": {
+ "type": "integer"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "top_k"
+ ]
+ },
+ "TopPSamplingStrategy": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "top_p",
+ "default": "top_p"
+ },
+ "temperature": {
+ "type": "number"
+ },
+ "top_p": {
+ "type": "number",
+ "default": 0.95
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ]
+ },
+ "URL": {
+ "type": "object",
+ "properties": {
+ "uri": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "uri"
+ ]
+ },
+ "DeprecatedEvaluateRowsRequest": {
+ "type": "object",
+ "properties": {
+ "input_rows": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "scoring_functions": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "task_config": {
+ "$ref": "#/components/schemas/BenchmarkConfig"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "input_rows",
+ "scoring_functions",
+ "task_config"
+ ]
+ },
+ "EvaluateResponse": {
+ "type": "object",
+ "properties": {
+ "generations": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "scores": {
+ "type": "object",
+ "additionalProperties": {
+ "$ref": "#/components/schemas/ScoringResult"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "generations",
+ "scores"
+ ]
+ },
+ "ScoringResult": {
+ "type": "object",
+ "properties": {
+ "score_rows": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "aggregated_results": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "score_rows",
+ "aggregated_results"
+ ]
+ },
+ "Benchmark": {
+ "type": "object",
+ "properties": {
+ "identifier": {
+ "type": "string"
+ },
+ "provider_resource_id": {
+ "type": "string"
+ },
+ "provider_id": {
+ "type": "string"
+ },
+ "type": {
+ "type": "string",
+ "const": "benchmark",
+ "default": "benchmark"
+ },
+ "dataset_id": {
+ "type": "string"
+ },
+ "scoring_functions": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "identifier",
+ "provider_resource_id",
+ "provider_id",
+ "type",
+ "dataset_id",
+ "scoring_functions",
+ "metadata"
+ ]
+ },
+ "JobStatus": {
+ "type": "string",
+ "enum": [
+ "completed",
+ "in_progress",
+ "failed",
+ "scheduled"
+ ]
+ },
+ "ListBenchmarksResponse": {
+ "type": "object",
+ "properties": {
+ "data": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/Benchmark"
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "data"
+ ]
+ },
+ "DeprecatedRegisterEvalTaskRequest": {
+ "type": "object",
+ "properties": {
+ "eval_task_id": {
+ "type": "string"
+ },
+ "dataset_id": {
+ "type": "string"
+ },
+ "scoring_functions": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "provider_benchmark_id": {
+ "type": "string"
+ },
+ "provider_id": {
+ "type": "string"
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "eval_task_id",
+ "dataset_id",
+ "scoring_functions"
+ ]
+ },
+ "DeprecatedRunEvalRequest": {
+ "type": "object",
+ "properties": {
+ "task_config": {
+ "$ref": "#/components/schemas/BenchmarkConfig"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "task_config"
+ ]
+ },
+ "Job": {
+ "type": "object",
+ "properties": {
+ "job_id": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "job_id"
+ ]
+ },
+ "AppendRowsRequest": {
+ "type": "object",
+ "properties": {
+ "dataset_id": {
+ "type": "string"
+ },
+ "rows": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "dataset_id",
+ "rows"
+ ]
+ },
+ "CompletionMessage": {
+ "type": "object",
+ "properties": {
+ "role": {
+ "type": "string",
+ "const": "assistant",
+ "default": "assistant",
+ "description": "Must be \"assistant\" to identify this as the model's response"
+ },
+ "content": {
+ "$ref": "#/components/schemas/InterleavedContent",
+ "description": "The content of the model's response"
+ },
+ "stop_reason": {
+ "type": "string",
+ "enum": [
+ "end_of_turn",
+ "end_of_message",
+ "out_of_tokens"
+ ],
+ "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget."
+ },
+ "tool_calls": {
+ "type": "array",
+ "items": {
+ "$ref": "#/components/schemas/ToolCall"
+ },
+ "description": "List of tool calls. Each tool call is a ToolCall object."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "role",
+ "content",
+ "stop_reason"
+ ],
+ "description": "A message containing the model's (assistant) response in a chat conversation."
+ },
+ "Message": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/UserMessage"
+ },
+ {
+ "$ref": "#/components/schemas/SystemMessage"
+ },
+ {
+ "$ref": "#/components/schemas/ToolResponseMessage"
+ },
+ {
+ "$ref": "#/components/schemas/CompletionMessage"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "role",
+ "mapping": {
+ "user": "#/components/schemas/UserMessage",
+ "system": "#/components/schemas/SystemMessage",
+ "tool": "#/components/schemas/ToolResponseMessage",
+ "assistant": "#/components/schemas/CompletionMessage"
+ }
+ }
+ },
"ToolCall": {
"type": "object",
"properties": {
@@ -2950,57 +4047,6 @@
],
"description": "A message representing the result of a tool invocation."
},
- "TopKSamplingStrategy": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "top_k",
- "default": "top_k"
- },
- "top_k": {
- "type": "integer"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "top_k"
- ]
- },
- "TopPSamplingStrategy": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "top_p",
- "default": "top_p"
- },
- "temperature": {
- "type": "number"
- },
- "top_p": {
- "type": "number",
- "default": 0.95
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ]
- },
- "URL": {
- "type": "object",
- "properties": {
- "uri": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "uri"
- ]
- },
"UserMessage": {
"type": "object",
"properties": {
@@ -3309,43 +4355,6 @@
"job_uuid"
]
},
- "ToolConfig": {
- "type": "object",
- "properties": {
- "tool_choice": {
- "type": "string",
- "enum": [
- "auto",
- "required"
- ],
- "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.",
- "default": "auto"
- },
- "tool_prompt_format": {
- "type": "string",
- "enum": [
- "json",
- "function_tag",
- "python_list"
- ],
- "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
- },
- "system_message_behavior": {
- "type": "string",
- "enum": [
- "append",
- "replace"
- ],
- "description": "(Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string '{{function_definitions}}' to indicate where the function definitions should be inserted.",
- "default": "append"
- }
- },
- "additionalProperties": false,
- "required": [
- "system_message_behavior"
- ],
- "description": "Configuration for tool use."
- },
"ChatCompletionRequest": {
"type": "object",
"properties": {
@@ -3644,218 +4653,6 @@
],
"description": "A chunk of a streamed completion response."
},
- "AgentConfig": {
- "type": "object",
- "properties": {
- "sampling_params": {
- "$ref": "#/components/schemas/SamplingParams"
- },
- "input_shields": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "output_shields": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "toolgroups": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/AgentTool"
- }
- },
- "client_tools": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ToolDef"
- }
- },
- "tool_choice": {
- "type": "string",
- "enum": [
- "auto",
- "required"
- ],
- "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
- },
- "tool_prompt_format": {
- "type": "string",
- "enum": [
- "json",
- "function_tag",
- "python_list"
- ],
- "description": "Prompt format for calling custom / zero shot tools."
- },
- "tool_config": {
- "$ref": "#/components/schemas/ToolConfig"
- },
- "max_infer_iters": {
- "type": "integer",
- "default": 10
- },
- "model": {
- "type": "string"
- },
- "instructions": {
- "type": "string"
- },
- "enable_session_persistence": {
- "type": "boolean"
- },
- "response_format": {
- "$ref": "#/components/schemas/ResponseFormat"
- }
- },
- "additionalProperties": false,
- "required": [
- "model",
- "instructions",
- "enable_session_persistence"
- ]
- },
- "AgentTool": {
- "oneOf": [
- {
- "type": "string"
- },
- {
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "args": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "name",
- "args"
- ]
- }
- ]
- },
- "ToolDef": {
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "description": {
- "type": "string"
- },
- "parameters": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/ToolParameter"
- }
- },
- "metadata": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "name"
- ]
- },
- "ToolParameter": {
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "parameter_type": {
- "type": "string"
- },
- "description": {
- "type": "string"
- },
- "required": {
- "type": "boolean",
- "default": true
- },
- "default": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- },
- "additionalProperties": false,
- "required": [
- "name",
- "parameter_type",
- "description",
- "required"
- ]
- },
"CreateAgentRequest": {
"type": "object",
"properties": {
@@ -4582,241 +5379,6 @@
],
"description": "Response containing generated embeddings."
},
- "AgentCandidate": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "agent",
- "default": "agent"
- },
- "config": {
- "$ref": "#/components/schemas/AgentConfig"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "config"
- ]
- },
- "AggregationFunctionType": {
- "type": "string",
- "enum": [
- "average",
- "median",
- "categorical_count",
- "accuracy"
- ]
- },
- "AppEvalTaskConfig": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "app",
- "default": "app"
- },
- "eval_candidate": {
- "$ref": "#/components/schemas/EvalCandidate"
- },
- "scoring_params": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ScoringFnParams"
- }
- },
- "num_examples": {
- "type": "integer"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "eval_candidate",
- "scoring_params"
- ]
- },
- "BasicScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "basic",
- "default": "basic"
- },
- "aggregation_functions": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/AggregationFunctionType"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ]
- },
- "BenchmarkEvalTaskConfig": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "benchmark",
- "default": "benchmark"
- },
- "eval_candidate": {
- "$ref": "#/components/schemas/EvalCandidate"
- },
- "num_examples": {
- "type": "integer"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "eval_candidate"
- ]
- },
- "EvalCandidate": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/ModelCandidate"
- },
- {
- "$ref": "#/components/schemas/AgentCandidate"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "model": "#/components/schemas/ModelCandidate",
- "agent": "#/components/schemas/AgentCandidate"
- }
- }
- },
- "EvalTaskConfig": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/BenchmarkEvalTaskConfig"
- },
- {
- "$ref": "#/components/schemas/AppEvalTaskConfig"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "benchmark": "#/components/schemas/BenchmarkEvalTaskConfig",
- "app": "#/components/schemas/AppEvalTaskConfig"
- }
- }
- },
- "LLMAsJudgeScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "llm_as_judge",
- "default": "llm_as_judge"
- },
- "judge_model": {
- "type": "string"
- },
- "prompt_template": {
- "type": "string"
- },
- "judge_score_regexes": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "aggregation_functions": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/AggregationFunctionType"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "judge_model"
- ]
- },
- "ModelCandidate": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "model",
- "default": "model"
- },
- "model": {
- "type": "string"
- },
- "sampling_params": {
- "$ref": "#/components/schemas/SamplingParams"
- },
- "system_message": {
- "$ref": "#/components/schemas/SystemMessage"
- }
- },
- "additionalProperties": false,
- "required": [
- "type",
- "model",
- "sampling_params"
- ]
- },
- "RegexParserScoringFnParams": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "regex_parser",
- "default": "regex_parser"
- },
- "parsing_regexes": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "aggregation_functions": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/AggregationFunctionType"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ]
- },
- "ScoringFnParams": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/RegexParserScoringFnParams"
- },
- {
- "$ref": "#/components/schemas/BasicScoringFnParams"
- }
- ],
- "discriminator": {
- "propertyName": "type",
- "mapping": {
- "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams",
- "regex_parser": "#/components/schemas/RegexParserScoringFnParams",
- "basic": "#/components/schemas/BasicScoringFnParams"
- }
- }
- },
"EvaluateRowsRequest": {
"type": "object",
"properties": {
@@ -4855,7 +5417,7 @@
}
},
"task_config": {
- "$ref": "#/components/schemas/EvalTaskConfig"
+ "$ref": "#/components/schemas/BenchmarkConfig"
}
},
"additionalProperties": false,
@@ -4865,113 +5427,6 @@
"task_config"
]
},
- "EvaluateResponse": {
- "type": "object",
- "properties": {
- "generations": {
- "type": "array",
- "items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "scores": {
- "type": "object",
- "additionalProperties": {
- "$ref": "#/components/schemas/ScoringResult"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "generations",
- "scores"
- ]
- },
- "ScoringResult": {
- "type": "object",
- "properties": {
- "score_rows": {
- "type": "array",
- "items": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "aggregated_results": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "score_rows",
- "aggregated_results"
- ]
- },
"Session": {
"type": "object",
"properties": {
@@ -5287,69 +5742,6 @@
"type"
]
},
- "EvalTask": {
- "type": "object",
- "properties": {
- "identifier": {
- "type": "string"
- },
- "provider_resource_id": {
- "type": "string"
- },
- "provider_id": {
- "type": "string"
- },
- "type": {
- "type": "string",
- "const": "eval_task",
- "default": "eval_task"
- },
- "dataset_id": {
- "type": "string"
- },
- "scoring_functions": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "metadata": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "identifier",
- "provider_resource_id",
- "provider_id",
- "type",
- "dataset_id",
- "scoring_functions",
- "metadata"
- ]
- },
"Model": {
"type": "object",
"properties": {
@@ -5891,15 +6283,6 @@
],
"description": "Artifacts of a finetuning job."
},
- "JobStatus": {
- "type": "string",
- "enum": [
- "completed",
- "in_progress",
- "failed",
- "scheduled"
- ]
- },
"PostTrainingJobStatusResponse": {
"type": "object",
"properties": {
@@ -6243,21 +6626,6 @@
"data"
]
},
- "ListEvalTasksResponse": {
- "type": "object",
- "properties": {
- "data": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/EvalTask"
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "data"
- ]
- },
"ListModelsResponse": {
"type": "object",
"properties": {
@@ -7169,6 +7537,60 @@
"data"
]
},
+ "RegisterBenchmarkRequest": {
+ "type": "object",
+ "properties": {
+ "benchmark_id": {
+ "type": "string"
+ },
+ "dataset_id": {
+ "type": "string"
+ },
+ "scoring_functions": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "provider_benchmark_id": {
+ "type": "string"
+ },
+ "provider_id": {
+ "type": "string"
+ },
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "benchmark_id",
+ "dataset_id",
+ "scoring_functions"
+ ]
+ },
"RegisterDatasetRequest": {
"type": "object",
"properties": {
@@ -7223,60 +7645,6 @@
"url"
]
},
- "RegisterEvalTaskRequest": {
- "type": "object",
- "properties": {
- "eval_task_id": {
- "type": "string"
- },
- "dataset_id": {
- "type": "string"
- },
- "scoring_functions": {
- "type": "array",
- "items": {
- "type": "string"
- }
- },
- "provider_eval_task_id": {
- "type": "string"
- },
- "provider_id": {
- "type": "string"
- },
- "metadata": {
- "type": "object",
- "additionalProperties": {
- "oneOf": [
- {
- "type": "null"
- },
- {
- "type": "boolean"
- },
- {
- "type": "number"
- },
- {
- "type": "string"
- },
- {
- "type": "array"
- },
- {
- "type": "object"
- }
- ]
- }
- }
- },
- "additionalProperties": false,
- "required": [
- "eval_task_id",
- "dataset_id",
- "scoring_functions"
- ]
- },
"RegisterModelRequest": {
"type": "object",
"properties": {
@@ -7468,7 +7836,7 @@
"type": "object",
"properties": {
"task_config": {
- "$ref": "#/components/schemas/EvalTaskConfig"
+ "$ref": "#/components/schemas/BenchmarkConfig"
}
},
"additionalProperties": false,
@@ -7476,18 +7844,6 @@
"task_config"
]
},
- "Job": {
- "type": "object",
- "properties": {
- "job_id": {
- "type": "string"
- }
- },
- "additionalProperties": false,
- "required": [
- "job_id"
- ]
- },
"RunShieldRequest": {
"type": "object",
"properties": {
@@ -7970,6 +8326,9 @@
{
"name": "BatchInference (Coming Soon)"
},
+ {
+ "name": "Benchmarks"
+ },
{
"name": "DatasetIO"
},
@@ -7979,9 +8338,6 @@
{
"name": "Eval"
},
- {
- "name": "EvalTasks"
- },
{
"name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
@@ -8033,10 +8389,10 @@
"tags": [
"Agents",
"BatchInference (Coming Soon)",
+ "Benchmarks",
"DatasetIO",
"Datasets",
"Eval",
- "EvalTasks",
"Inference",
"Inspect",
"Models",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index a646d7e08..b30025020 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -10,6 +10,175 @@ info:
servers:
- url: http://any-hosted-llama-stack.com
paths:
+ /v1/eval/tasks/{task_id}/evaluations:
+ post:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluateResponse'
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/DeprecatedEvaluateRowsRequest'
+ required: true
+ deprecated: true
+ /v1/eval-tasks/{task_id}:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ oneOf:
+ - $ref: '#/components/schemas/Benchmark'
+ - type: 'null'
+ tags:
+ - Benchmarks
+ description: ''
+ parameters:
+      - name: task_id
+        in: path
+ required: true
+ schema:
+ type: string
+ deprecated: true
+ /v1/eval/tasks/{task_id}/jobs/{job_id}:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ oneOf:
+ - $ref: '#/components/schemas/JobStatus'
+ - type: 'null'
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ - name: job_id
+ in: path
+ required: true
+ schema:
+ type: string
+ deprecated: true
+ delete:
+ responses:
+ '200':
+ description: OK
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ - name: job_id
+ in: path
+ required: true
+ schema:
+ type: string
+ deprecated: true
+ /v1/eval/tasks/{task_id}/jobs/{job_id}/result:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/EvaluateResponse'
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ - name: job_id
+ in: path
+ required: true
+ schema:
+ type: string
+ deprecated: true
+ /v1/eval-tasks:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ListBenchmarksResponse'
+ tags:
+ - Benchmarks
+ description: ''
+ parameters: []
+ deprecated: true
+ post:
+ responses:
+ '200':
+ description: OK
+ tags:
+ - Benchmarks
+ description: ''
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest'
+ required: true
+ deprecated: true
+ /v1/eval/tasks/{task_id}/jobs:
+ post:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/Job'
+ tags:
+ - Eval
+ description: ''
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/DeprecatedRunEvalRequest'
+ required: true
+ deprecated: true
/v1/datasetio/rows:
get:
responses:
@@ -322,7 +491,7 @@ paths:
schema:
$ref: '#/components/schemas/EmbeddingsRequest'
required: true
- /v1/eval/tasks/{task_id}/evaluations:
+ /v1/eval/benchmarks/{benchmark_id}/evaluations:
post:
responses:
'200':
@@ -335,7 +504,7 @@ paths:
- Eval
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -407,6 +576,26 @@ paths:
required: true
schema:
type: string
+ /v1/eval/benchmarks/{benchmark_id}:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ oneOf:
+ - $ref: '#/components/schemas/Benchmark'
+ - type: 'null'
+ tags:
+ - Benchmarks
+ description: ''
+ parameters:
+ - name: benchmark_id
+ in: path
+ required: true
+ schema:
+ type: string
/v1/datasets/{dataset_id}:
get:
responses:
@@ -440,26 +629,6 @@ paths:
required: true
schema:
type: string
- /v1/eval-tasks/{eval_task_id}:
- get:
- responses:
- '200':
- description: OK
- content:
- application/json:
- schema:
- oneOf:
- - $ref: '#/components/schemas/EvalTask'
- - type: 'null'
- tags:
- - EvalTasks
- description: ''
- parameters:
- - name: eval_task_id
- in: path
- required: true
- schema:
- type: string
/v1/models/{model_id}:
get:
responses:
@@ -802,7 +971,7 @@ paths:
schema:
$ref: '#/components/schemas/InvokeToolRequest'
required: true
- /v1/eval/tasks/{task_id}/jobs/{job_id}:
+ /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
get:
responses:
'200':
@@ -817,7 +986,7 @@ paths:
- Eval
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -835,7 +1004,7 @@ paths:
- Eval
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -845,7 +1014,7 @@ paths:
required: true
schema:
type: string
- /v1/eval/tasks/{task_id}/jobs/{job_id}/result:
+ /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
get:
responses:
'200':
@@ -858,16 +1027,43 @@ paths:
- Eval
description: ''
parameters:
+ - name: benchmark_id
+ in: path
+ required: true
+ schema:
+ type: string
- name: job_id
in: path
required: true
schema:
type: string
- - name: task_id
- in: path
- required: true
- schema:
- type: string
+ /v1/eval/benchmarks:
+ get:
+ responses:
+ '200':
+ description: OK
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ListBenchmarksResponse'
+ tags:
+ - Benchmarks
+ description: ''
+ parameters: []
+ post:
+ responses:
+ '200':
+ description: OK
+ tags:
+ - Benchmarks
+ description: ''
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/RegisterBenchmarkRequest'
+ required: true
/v1/datasets:
get:
responses:
@@ -895,33 +1091,6 @@ paths:
schema:
$ref: '#/components/schemas/RegisterDatasetRequest'
required: true
- /v1/eval-tasks:
- get:
- responses:
- '200':
- description: OK
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ListEvalTasksResponse'
- tags:
- - EvalTasks
- description: ''
- parameters: []
- post:
- responses:
- '200':
- description: OK
- tags:
- - EvalTasks
- description: ''
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/RegisterEvalTaskRequest'
- required: true
/v1/models:
get:
responses:
@@ -1278,7 +1447,7 @@ paths:
type: array
items:
type: string
- /v1/eval/tasks/{task_id}/jobs:
+ /v1/eval/benchmarks/{benchmark_id}/jobs:
post:
responses:
'200':
@@ -1291,7 +1460,7 @@ paths:
- Eval
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -1429,65 +1598,146 @@ jsonSchemaDialect: >-
https://json-schema.org/draft/2020-12/schema
components:
schemas:
- AppendRowsRequest:
+ AgentCandidate:
type: object
properties:
- dataset_id:
+ type:
type: string
- rows:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
+ const: agent
+ default: agent
+ config:
+ $ref: '#/components/schemas/AgentConfig'
additionalProperties: false
required:
- - dataset_id
- - rows
- CompletionMessage:
+ - type
+ - config
+ AgentConfig:
type: object
properties:
- role:
- type: string
- const: assistant
- default: assistant
- description: >-
- Must be "assistant" to identify this as the model's response
- content:
- $ref: '#/components/schemas/InterleavedContent'
- description: The content of the model's response
- stop_reason:
+ sampling_params:
+ $ref: '#/components/schemas/SamplingParams'
+ input_shields:
+ type: array
+ items:
+ type: string
+ output_shields:
+ type: array
+ items:
+ type: string
+ toolgroups:
+ type: array
+ items:
+ $ref: '#/components/schemas/AgentTool'
+ client_tools:
+ type: array
+ items:
+ $ref: '#/components/schemas/ToolDef'
+ tool_choice:
type: string
enum:
- - end_of_turn
- - end_of_message
- - out_of_tokens
+ - auto
+ - required
description: >-
- Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`:
- The model finished generating the entire response. - `StopReason.end_of_message`:
- The model finished generating but generated a partial response -- usually,
- a tool call. The user may call the tool and continue the conversation
- with the tool's response. - `StopReason.out_of_tokens`: The model ran
- out of token budget.
- tool_calls:
- type: array
- items:
- $ref: '#/components/schemas/ToolCall'
+ Whether tool use is required or automatic. This is a hint to the model
+ which may not be followed. It depends on the Instruction Following capabilities
+ of the model.
+ tool_prompt_format:
+ type: string
+ enum:
+ - json
+ - function_tag
+ - python_list
description: >-
- List of tool calls. Each tool call is a ToolCall object.
+ Prompt format for calling custom / zero shot tools.
+ tool_config:
+ $ref: '#/components/schemas/ToolConfig'
+ max_infer_iters:
+ type: integer
+ default: 10
+ model:
+ type: string
+ instructions:
+ type: string
+ enable_session_persistence:
+ type: boolean
+ response_format:
+ $ref: '#/components/schemas/ResponseFormat'
additionalProperties: false
required:
- - role
- - content
- - stop_reason
- description: >-
- A message containing the model's (assistant) response in a chat conversation.
+ - model
+ - instructions
+ - enable_session_persistence
+ AgentTool:
+ oneOf:
+ - type: string
+ - type: object
+ properties:
+ name:
+ type: string
+ args:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - name
+ - args
+ AggregationFunctionType:
+ type: string
+ enum:
+ - average
+ - median
+ - categorical_count
+ - accuracy
+ BasicScoringFnParams:
+ type: object
+ properties:
+ type:
+ type: string
+ const: basic
+ default: basic
+ aggregation_functions:
+ type: array
+ items:
+ $ref: '#/components/schemas/AggregationFunctionType'
+ additionalProperties: false
+ required:
+ - type
+ BenchmarkConfig:
+ type: object
+ properties:
+ type:
+ type: string
+ const: benchmark
+ default: benchmark
+ eval_candidate:
+ $ref: '#/components/schemas/EvalCandidate'
+ scoring_params:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ScoringFnParams'
+ num_examples:
+ type: integer
+ additionalProperties: false
+ required:
+ - type
+ - eval_candidate
+ - scoring_params
+ EvalCandidate:
+ oneOf:
+ - $ref: '#/components/schemas/ModelCandidate'
+ - $ref: '#/components/schemas/AgentCandidate'
+ discriminator:
+ propertyName: type
+ mapping:
+ model: '#/components/schemas/ModelCandidate'
+ agent: '#/components/schemas/AgentCandidate'
GrammarResponseFormat:
type: object
properties:
@@ -1598,19 +1848,65 @@ components:
- json_schema
description: >-
Configuration for JSON schema-guided response generation.
- Message:
- oneOf:
- - $ref: '#/components/schemas/UserMessage'
- - $ref: '#/components/schemas/SystemMessage'
- - $ref: '#/components/schemas/ToolResponseMessage'
- - $ref: '#/components/schemas/CompletionMessage'
- discriminator:
- propertyName: role
- mapping:
- user: '#/components/schemas/UserMessage'
- system: '#/components/schemas/SystemMessage'
- tool: '#/components/schemas/ToolResponseMessage'
- assistant: '#/components/schemas/CompletionMessage'
+ LLMAsJudgeScoringFnParams:
+ type: object
+ properties:
+ type:
+ type: string
+ const: llm_as_judge
+ default: llm_as_judge
+ judge_model:
+ type: string
+ prompt_template:
+ type: string
+ judge_score_regexes:
+ type: array
+ items:
+ type: string
+ aggregation_functions:
+ type: array
+ items:
+ $ref: '#/components/schemas/AggregationFunctionType'
+ additionalProperties: false
+ required:
+ - type
+ - judge_model
+ ModelCandidate:
+ type: object
+ properties:
+ type:
+ type: string
+ const: model
+ default: model
+ model:
+ type: string
+ sampling_params:
+ $ref: '#/components/schemas/SamplingParams'
+ system_message:
+ $ref: '#/components/schemas/SystemMessage'
+ additionalProperties: false
+ required:
+ - type
+ - model
+ - sampling_params
+ RegexParserScoringFnParams:
+ type: object
+ properties:
+ type:
+ type: string
+ const: regex_parser
+ default: regex_parser
+ parsing_regexes:
+ type: array
+ items:
+ type: string
+ aggregation_functions:
+ type: array
+ items:
+ $ref: '#/components/schemas/AggregationFunctionType'
+ additionalProperties: false
+ required:
+ - type
ResponseFormat:
oneOf:
- $ref: '#/components/schemas/JsonSchemaResponseFormat'
@@ -1645,6 +1941,17 @@ components:
greedy: '#/components/schemas/GreedySamplingStrategy'
top_p: '#/components/schemas/TopPSamplingStrategy'
top_k: '#/components/schemas/TopKSamplingStrategy'
+ ScoringFnParams:
+ oneOf:
+ - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
+ - $ref: '#/components/schemas/RegexParserScoringFnParams'
+ - $ref: '#/components/schemas/BasicScoringFnParams'
+ discriminator:
+ propertyName: type
+ mapping:
+ llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
+ regex_parser: '#/components/schemas/RegexParserScoringFnParams'
+ basic: '#/components/schemas/BasicScoringFnParams'
SystemMessage:
type: object
properties:
@@ -1683,6 +1990,383 @@ components:
- type
- text
description: A text content item
+ ToolConfig:
+ type: object
+ properties:
+ tool_choice:
+ type: string
+ enum:
+ - auto
+ - required
+ description: >-
+ (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
+ default: auto
+ tool_prompt_format:
+ type: string
+ enum:
+ - json
+ - function_tag
+ - python_list
+ description: >-
+ (Optional) Instructs the model how to format tool calls. By default, Llama
+ Stack will attempt to use a format that is best adapted to the model.
+ - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
+ - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
+ tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
+ syntax -- a list of function calls.
+ system_message_behavior:
+ type: string
+ enum:
+ - append
+ - replace
+ description: >-
+ (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
+ Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
+ Replaces the default system prompt with the provided system message. The
+ system message can include the string '{{function_definitions}}' to indicate
+ where the function definitions should be inserted.
+ default: append
+ additionalProperties: false
+ required:
+ - system_message_behavior
+ description: Configuration for tool use.
+ ToolDef:
+ type: object
+ properties:
+ name:
+ type: string
+ description:
+ type: string
+ parameters:
+ type: array
+ items:
+ $ref: '#/components/schemas/ToolParameter'
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - name
+ ToolParameter:
+ type: object
+ properties:
+ name:
+ type: string
+ parameter_type:
+ type: string
+ description:
+ type: string
+ required:
+ type: boolean
+ default: true
+ default:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - name
+ - parameter_type
+ - description
+ - required
+ TopKSamplingStrategy:
+ type: object
+ properties:
+ type:
+ type: string
+ const: top_k
+ default: top_k
+ top_k:
+ type: integer
+ additionalProperties: false
+ required:
+ - type
+ - top_k
+ TopPSamplingStrategy:
+ type: object
+ properties:
+ type:
+ type: string
+ const: top_p
+ default: top_p
+ temperature:
+ type: number
+ top_p:
+ type: number
+ default: 0.95
+ additionalProperties: false
+ required:
+ - type
+ URL:
+ type: object
+ properties:
+ uri:
+ type: string
+ additionalProperties: false
+ required:
+ - uri
+ DeprecatedEvaluateRowsRequest:
+ type: object
+ properties:
+ input_rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ scoring_functions:
+ type: array
+ items:
+ type: string
+ task_config:
+ $ref: '#/components/schemas/BenchmarkConfig'
+ additionalProperties: false
+ required:
+ - input_rows
+ - scoring_functions
+ - task_config
+ EvaluateResponse:
+ type: object
+ properties:
+ generations:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ scores:
+ type: object
+ additionalProperties:
+ $ref: '#/components/schemas/ScoringResult'
+ additionalProperties: false
+ required:
+ - generations
+ - scores
+ ScoringResult:
+ type: object
+ properties:
+ score_rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ aggregated_results:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - score_rows
+ - aggregated_results
+ Benchmark:
+ type: object
+ properties:
+ identifier:
+ type: string
+ provider_resource_id:
+ type: string
+ provider_id:
+ type: string
+ type:
+ type: string
+ const: benchmark
+ default: benchmark
+ dataset_id:
+ type: string
+ scoring_functions:
+ type: array
+ items:
+ type: string
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - identifier
+ - provider_resource_id
+ - provider_id
+ - type
+ - dataset_id
+ - scoring_functions
+ - metadata
+ JobStatus:
+ type: string
+ enum:
+ - completed
+ - in_progress
+ - failed
+ - scheduled
+ ListBenchmarksResponse:
+ type: object
+ properties:
+ data:
+ type: array
+ items:
+ $ref: '#/components/schemas/Benchmark'
+ additionalProperties: false
+ required:
+ - data
+ DeprecatedRegisterEvalTaskRequest:
+ type: object
+ properties:
+ eval_task_id:
+ type: string
+ dataset_id:
+ type: string
+ scoring_functions:
+ type: array
+ items:
+ type: string
+ provider_benchmark_id:
+ type: string
+ provider_id:
+ type: string
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - eval_task_id
+ - dataset_id
+ - scoring_functions
+ DeprecatedRunEvalRequest:
+ type: object
+ properties:
+ task_config:
+ $ref: '#/components/schemas/BenchmarkConfig'
+ additionalProperties: false
+ required:
+ - task_config
+ Job:
+ type: object
+ properties:
+ job_id:
+ type: string
+ additionalProperties: false
+ required:
+ - job_id
+ AppendRowsRequest:
+ type: object
+ properties:
+ dataset_id:
+ type: string
+ rows:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - dataset_id
+ - rows
+ CompletionMessage:
+ type: object
+ properties:
+ role:
+ type: string
+ const: assistant
+ default: assistant
+ description: >-
+ Must be "assistant" to identify this as the model's response
+ content:
+ $ref: '#/components/schemas/InterleavedContent'
+ description: The content of the model's response
+ stop_reason:
+ type: string
+ enum:
+ - end_of_turn
+ - end_of_message
+ - out_of_tokens
+ description: >-
+ Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`:
+ The model finished generating the entire response. - `StopReason.end_of_message`:
+ The model finished generating but generated a partial response -- usually,
+ a tool call. The user may call the tool and continue the conversation
+ with the tool's response. - `StopReason.out_of_tokens`: The model ran
+ out of token budget.
+ tool_calls:
+ type: array
+ items:
+ $ref: '#/components/schemas/ToolCall'
+ description: >-
+ List of tool calls. Each tool call is a ToolCall object.
+ additionalProperties: false
+ required:
+ - role
+ - content
+ - stop_reason
+ description: >-
+ A message containing the model's (assistant) response in a chat conversation.
+ Message:
+ oneOf:
+ - $ref: '#/components/schemas/UserMessage'
+ - $ref: '#/components/schemas/SystemMessage'
+ - $ref: '#/components/schemas/ToolResponseMessage'
+ - $ref: '#/components/schemas/CompletionMessage'
+ discriminator:
+ propertyName: role
+ mapping:
+ user: '#/components/schemas/UserMessage'
+ system: '#/components/schemas/SystemMessage'
+ tool: '#/components/schemas/ToolResponseMessage'
+ assistant: '#/components/schemas/CompletionMessage'
ToolCall:
type: object
properties:
@@ -1803,42 +2487,6 @@ components:
- content
description: >-
A message representing the result of a tool invocation.
- TopKSamplingStrategy:
- type: object
- properties:
- type:
- type: string
- const: top_k
- default: top_k
- top_k:
- type: integer
- additionalProperties: false
- required:
- - type
- - top_k
- TopPSamplingStrategy:
- type: object
- properties:
- type:
- type: string
- const: top_p
- default: top_p
- temperature:
- type: number
- top_p:
- type: number
- default: 0.95
- additionalProperties: false
- required:
- - type
- URL:
- type: object
- properties:
- uri:
- type: string
- additionalProperties: false
- required:
- - uri
UserMessage:
type: object
properties:
@@ -2063,46 +2711,6 @@ components:
additionalProperties: false
required:
- job_uuid
- ToolConfig:
- type: object
- properties:
- tool_choice:
- type: string
- enum:
- - auto
- - required
- description: >-
- (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
- default: auto
- tool_prompt_format:
- type: string
- enum:
- - json
- - function_tag
- - python_list
- description: >-
- (Optional) Instructs the model how to format tool calls. By default, Llama
- Stack will attempt to use a format that is best adapted to the model.
- - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name>
- tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
- syntax -- a list of function calls.
- system_message_behavior:
- type: string
- enum:
- - append
- - replace
- description: >-
- (Optional) Config for how to override the default system prompt. - `SystemMessageBehavior.append`:
- Appends the provided system message to the default system prompt. - `SystemMessageBehavior.replace`:
- Replaces the default system prompt with the provided system message. The
- system message can include the string '{{function_definitions}}' to indicate
- where the function definitions should be inserted.
- default: append
- additionalProperties: false
- required:
- - system_message_behavior
- description: Configuration for tool use.
ChatCompletionRequest:
type: object
properties:
@@ -2356,133 +2964,6 @@ components:
- delta
description: >-
A chunk of a streamed completion response.
- AgentConfig:
- type: object
- properties:
- sampling_params:
- $ref: '#/components/schemas/SamplingParams'
- input_shields:
- type: array
- items:
- type: string
- output_shields:
- type: array
- items:
- type: string
- toolgroups:
- type: array
- items:
- $ref: '#/components/schemas/AgentTool'
- client_tools:
- type: array
- items:
- $ref: '#/components/schemas/ToolDef'
- tool_choice:
- type: string
- enum:
- - auto
- - required
- description: >-
- Whether tool use is required or automatic. This is a hint to the model
- which may not be followed. It depends on the Instruction Following capabilities
- of the model.
- tool_prompt_format:
- type: string
- enum:
- - json
- - function_tag
- - python_list
- description: >-
- Prompt format for calling custom / zero shot tools.
- tool_config:
- $ref: '#/components/schemas/ToolConfig'
- max_infer_iters:
- type: integer
- default: 10
- model:
- type: string
- instructions:
- type: string
- enable_session_persistence:
- type: boolean
- response_format:
- $ref: '#/components/schemas/ResponseFormat'
- additionalProperties: false
- required:
- - model
- - instructions
- - enable_session_persistence
- AgentTool:
- oneOf:
- - type: string
- - type: object
- properties:
- name:
- type: string
- args:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - name
- - args
- ToolDef:
- type: object
- properties:
- name:
- type: string
- description:
- type: string
- parameters:
- type: array
- items:
- $ref: '#/components/schemas/ToolParameter'
- metadata:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - name
- ToolParameter:
- type: object
- properties:
- name:
- type: string
- parameter_type:
- type: string
- description:
- type: string
- required:
- type: boolean
- default: true
- default:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - name
- - parameter_type
- - description
- - required
CreateAgentRequest:
type: object
properties:
@@ -2962,163 +3443,6 @@ components:
- embeddings
description: >-
Response containing generated embeddings.
- AgentCandidate:
- type: object
- properties:
- type:
- type: string
- const: agent
- default: agent
- config:
- $ref: '#/components/schemas/AgentConfig'
- additionalProperties: false
- required:
- - type
- - config
- AggregationFunctionType:
- type: string
- enum:
- - average
- - median
- - categorical_count
- - accuracy
- AppEvalTaskConfig:
- type: object
- properties:
- type:
- type: string
- const: app
- default: app
- eval_candidate:
- $ref: '#/components/schemas/EvalCandidate'
- scoring_params:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ScoringFnParams'
- num_examples:
- type: integer
- additionalProperties: false
- required:
- - type
- - eval_candidate
- - scoring_params
- BasicScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: basic
- default: basic
- aggregation_functions:
- type: array
- items:
- $ref: '#/components/schemas/AggregationFunctionType'
- additionalProperties: false
- required:
- - type
- BenchmarkEvalTaskConfig:
- type: object
- properties:
- type:
- type: string
- const: benchmark
- default: benchmark
- eval_candidate:
- $ref: '#/components/schemas/EvalCandidate'
- num_examples:
- type: integer
- additionalProperties: false
- required:
- - type
- - eval_candidate
- EvalCandidate:
- oneOf:
- - $ref: '#/components/schemas/ModelCandidate'
- - $ref: '#/components/schemas/AgentCandidate'
- discriminator:
- propertyName: type
- mapping:
- model: '#/components/schemas/ModelCandidate'
- agent: '#/components/schemas/AgentCandidate'
- EvalTaskConfig:
- oneOf:
- - $ref: '#/components/schemas/BenchmarkEvalTaskConfig'
- - $ref: '#/components/schemas/AppEvalTaskConfig'
- discriminator:
- propertyName: type
- mapping:
- benchmark: '#/components/schemas/BenchmarkEvalTaskConfig'
- app: '#/components/schemas/AppEvalTaskConfig'
- LLMAsJudgeScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: llm_as_judge
- default: llm_as_judge
- judge_model:
- type: string
- prompt_template:
- type: string
- judge_score_regexes:
- type: array
- items:
- type: string
- aggregation_functions:
- type: array
- items:
- $ref: '#/components/schemas/AggregationFunctionType'
- additionalProperties: false
- required:
- - type
- - judge_model
- ModelCandidate:
- type: object
- properties:
- type:
- type: string
- const: model
- default: model
- model:
- type: string
- sampling_params:
- $ref: '#/components/schemas/SamplingParams'
- system_message:
- $ref: '#/components/schemas/SystemMessage'
- additionalProperties: false
- required:
- - type
- - model
- - sampling_params
- RegexParserScoringFnParams:
- type: object
- properties:
- type:
- type: string
- const: regex_parser
- default: regex_parser
- parsing_regexes:
- type: array
- items:
- type: string
- aggregation_functions:
- type: array
- items:
- $ref: '#/components/schemas/AggregationFunctionType'
- additionalProperties: false
- required:
- - type
- ScoringFnParams:
- oneOf:
- - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
- - $ref: '#/components/schemas/RegexParserScoringFnParams'
- - $ref: '#/components/schemas/BasicScoringFnParams'
- discriminator:
- propertyName: type
- mapping:
- llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
- regex_parser: '#/components/schemas/RegexParserScoringFnParams'
- basic: '#/components/schemas/BasicScoringFnParams'
EvaluateRowsRequest:
type: object
properties:
@@ -3139,64 +3463,12 @@ components:
items:
type: string
task_config:
- $ref: '#/components/schemas/EvalTaskConfig'
+ $ref: '#/components/schemas/BenchmarkConfig'
additionalProperties: false
required:
- input_rows
- scoring_functions
- task_config
- EvaluateResponse:
- type: object
- properties:
- generations:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- scores:
- type: object
- additionalProperties:
- $ref: '#/components/schemas/ScoringResult'
- additionalProperties: false
- required:
- - generations
- - scores
- ScoringResult:
- type: object
- properties:
- score_rows:
- type: array
- items:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- aggregated_results:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - score_rows
- - aggregated_results
Session:
type: object
properties:
@@ -3401,44 +3673,6 @@ components:
additionalProperties: false
required:
- type
- EvalTask:
- type: object
- properties:
- identifier:
- type: string
- provider_resource_id:
- type: string
- provider_id:
- type: string
- type:
- type: string
- const: eval_task
- default: eval_task
- dataset_id:
- type: string
- scoring_functions:
- type: array
- items:
- type: string
- metadata:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - identifier
- - provider_resource_id
- - provider_id
- - type
- - dataset_id
- - scoring_functions
- - metadata
Model:
type: object
properties:
@@ -3766,13 +4000,6 @@ components:
- job_uuid
- checkpoints
description: Artifacts of a finetuning job.
- JobStatus:
- type: string
- enum:
- - completed
- - in_progress
- - failed
- - scheduled
PostTrainingJobStatusResponse:
type: object
properties:
@@ -3977,16 +4204,6 @@ components:
additionalProperties: false
required:
- data
- ListEvalTasksResponse:
- type: object
- properties:
- data:
- type: array
- items:
- $ref: '#/components/schemas/EvalTask'
- additionalProperties: false
- required:
- - data
ListModelsResponse:
type: object
properties:
@@ -4569,6 +4786,36 @@ components:
additionalProperties: false
required:
- data
+ RegisterBenchmarkRequest:
+ type: object
+ properties:
+ benchmark_id:
+ type: string
+ dataset_id:
+ type: string
+ scoring_functions:
+ type: array
+ items:
+ type: string
+ provider_benchmark_id:
+ type: string
+ provider_id:
+ type: string
+ metadata:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - benchmark_id
+ - dataset_id
+ - scoring_functions
RegisterDatasetRequest:
type: object
properties:
@@ -4599,36 +4846,6 @@ components:
- dataset_id
- dataset_schema
- url
- RegisterEvalTaskRequest:
- type: object
- properties:
- eval_task_id:
- type: string
- dataset_id:
- type: string
- scoring_functions:
- type: array
- items:
- type: string
- provider_eval_task_id:
- type: string
- provider_id:
- type: string
- metadata:
- type: object
- additionalProperties:
- oneOf:
- - type: 'null'
- - type: boolean
- - type: number
- - type: string
- - type: array
- - type: object
- additionalProperties: false
- required:
- - eval_task_id
- - dataset_id
- - scoring_functions
RegisterModelRequest:
type: object
properties:
@@ -4739,18 +4956,10 @@ components:
type: object
properties:
task_config:
- $ref: '#/components/schemas/EvalTaskConfig'
+ $ref: '#/components/schemas/BenchmarkConfig'
additionalProperties: false
required:
- task_config
- Job:
- type: object
- properties:
- job_id:
- type: string
- additionalProperties: false
- required:
- - job_id
RunShieldRequest:
type: object
properties:
@@ -5049,10 +5258,10 @@ tags:
x-displayName: >-
Agents API for creating and interacting with agentic systems.
- name: BatchInference (Coming Soon)
+ - name: Benchmarks
- name: DatasetIO
- name: Datasets
- name: Eval
- - name: EvalTasks
- name: Inference
description: >-
This API provides the raw interface to the underlying models. Two kinds of models
@@ -5083,10 +5292,10 @@ x-tagGroups:
tags:
- Agents
- BatchInference (Coming Soon)
+ - Benchmarks
- DatasetIO
- Datasets
- Eval
- - EvalTasks
- Inference
- Inspect
- Models
diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb
index abe537c8e..ee616b471 100644
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
@@ -324,7 +324,7 @@
"- vector_io\n",
"container_image: null\n",
"datasets: []\n",
- "eval_tasks: []\n",
+ "benchmarks: []\n",
"image_name: together\n",
"metadata_store:\n",
" db_path: /Users/ashwin/.llama/distributions/together/registry.db\n",
@@ -508,7 +508,7 @@
"- vector_io\n",
"container_image: null\n",
"datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
- "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
+ "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"image_name: together\n",
"metadata_store:\n",
" db_path: \u001b[35m/Users/ashwin/.llama/distributions/together/\u001b[0m\u001b[95mregistry.db\u001b[0m\n",
diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
index 84da25246..8eecf84ab 100644
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@@ -370,7 +370,7 @@
"- tool_runtime\n",
"datasets: []\n",
"container_image: null\n",
- "eval_tasks: []\n",
+ "benchmarks: []\n",
"image_name: together\n",
"memory_banks: []\n",
"metadata_store:\n",
@@ -551,7 +551,7 @@
"- tool_runtime\n",
"datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"container_image: null\n",
- "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
+ "benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"image_name: together\n",
"memory_banks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"metadata_store:\n",
diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py
index a0385cae0..0f3b99784 100644
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@@ -647,6 +647,7 @@ class Generator:
description = "\n".join(
filter(None, [doc_string.short_description, doc_string.long_description])
)
+
return Operation(
tags=[op.defining_class.__name__],
summary=None,
@@ -656,6 +657,7 @@ class Generator:
requestBody=requestBody,
responses=responses,
callbacks=callbacks,
+ deprecated=True if "DEPRECATED" in op.func_name else None,
security=[] if op.public else None,
)
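The new `deprecated` flag is derived purely from the handler name; a minimal sketch of the convention, assuming nothing beyond the string check in the hunk above:

```python
# Any webmethod whose Python name contains "DEPRECATED" is emitted with
# "deprecated": true in the generated spec; other operations omit the key.
def deprecated_flag(func_name: str):
    return True if "DEPRECATED" in func_name else None

assert deprecated_flag("DEPRECATED_run_eval") is True
assert deprecated_flag("run_eval") is None
```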
diff --git a/docs/openapi_generator/pyopenapi/specification.py b/docs/openapi_generator/pyopenapi/specification.py
index 4b54295c5..f96de58b6 100644
--- a/docs/openapi_generator/pyopenapi/specification.py
+++ b/docs/openapi_generator/pyopenapi/specification.py
@@ -117,6 +117,7 @@ class Operation:
requestBody: Optional[RequestBody] = None
callbacks: Optional[Dict[str, "Callback"]] = None
security: Optional[List["SecurityRequirement"]] = None
+ deprecated: Optional[bool] = None
@dataclass
diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md
index c4cb476e4..f28e0d5fd 100644
--- a/docs/source/building_applications/evals.md
+++ b/docs/source/building_applications/evals.md
@@ -41,14 +41,14 @@ system_message = {
"content": SYSTEM_PROMPT_TEMPLATE,
}
-client.eval_tasks.register(
- eval_task_id="meta-reference::mmmu",
+client.benchmarks.register(
+ benchmark_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
response = client.eval.evaluate_rows(
- task_id="meta-reference::mmmu",
+ benchmark_id="meta-reference::mmmu",
input_rows=eval_rows,
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
task_config={
@@ -99,14 +99,14 @@ eval_rows = client.datasetio.get_rows_paginated(
```
```python
-client.eval_tasks.register(
- eval_task_id="meta-reference::simpleqa",
+client.benchmarks.register(
+ benchmark_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"],
)
response = client.eval.evaluate_rows(
- task_id="meta-reference::simpleqa",
+ benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
@@ -156,7 +156,7 @@ agent_config = {
}
response = client.eval.evaluate_rows(
- task_id="meta-reference::simpleqa",
+ benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
diff --git a/docs/source/building_applications/evaluation.md b/docs/source/building_applications/evaluation.md
index 91e5c552b..ad220f751 100644
--- a/docs/source/building_applications/evaluation.md
+++ b/docs/source/building_applications/evaluation.md
@@ -10,15 +10,15 @@ Here's how to set up basic evaluation:
```python
# Create an evaluation task
-response = client.eval_tasks.register(
- eval_task_id="my_eval",
+response = client.benchmarks.register(
+ benchmark_id="my_eval",
dataset_id="my_dataset",
scoring_functions=["accuracy", "relevance"],
)
# Run evaluation
job = client.eval.run_eval(
- task_id="my_eval",
+ benchmark_id="my_eval",
task_config={
"type": "app",
"eval_candidate": {"type": "agent", "config": agent_config},
@@ -26,5 +26,5 @@ job = client.eval.run_eval(
)
# Get results
-result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)
+result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
```
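Since `run_eval` returns a job handle, callers generally wait for the job to finish before asking for results. A hedged sketch of that wait, assuming the renamed `client.eval.jobs.status` helper and string-valued statuses (the exact return type depends on the client version):

```python
import time

# Block until the job is no longer queued or running; JobStatus values in the
# spec are "scheduled", "in_progress", "completed", and "failed".
while client.eval.jobs.status(job_id=job.job_id, benchmark_id="my_eval") in (
    "scheduled",
    "in_progress",
):
    time.sleep(5)
```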
diff --git a/docs/source/concepts/evaluation_concepts.md b/docs/source/concepts/evaluation_concepts.md
index 399d99d92..3ca4b0ac8 100644
--- a/docs/source/concepts/evaluation_concepts.md
+++ b/docs/source/concepts/evaluation_concepts.md
@@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl
We introduce a set of APIs in Llama Stack to support running evaluations of LLM applications.
- `/datasetio` + `/datasets` API
- `/scoring` + `/scoring_functions` API
-- `/eval` + `/eval_tasks` API
+- `/eval` + `/benchmarks` API
This guide goes over the set of APIs and the developer experience of using Llama Stack to run evaluations for different use cases. Check out our Colab notebook with working evaluation examples [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
@@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
- **Scoring**: evaluate outputs of the system.
- Associated with `ScoringFunction` resource. We provide a suite of out-of-the-box scoring functions and the ability to add custom evaluators. These scoring functions are the core part of defining an evaluation task and the metrics it outputs.
- **Eval**: generate outputs (via Inference or Agents) and perform scoring.
- - Associated with `EvalTask` resource.
+ - Associated with `Benchmark` resource.
Use the following decision tree to decide how to use the Llama Stack evaluation flow.
diff --git a/docs/source/concepts/index.md b/docs/source/concepts/index.md
index 1437ec623..403e47c48 100644
--- a/docs/source/concepts/index.md
+++ b/docs/source/concepts/index.md
@@ -42,7 +42,7 @@ Some of these APIs are associated with a set of **Resources**. Here is the mappi
- **Tool Runtime** is associated with `ToolGroup` resources.
- **DatasetIO** is associated with `Dataset` resources.
- **Scoring** is associated with `ScoringFunction` resources.
-- **Eval** is associated with `Model` and `EvalTask` resources.
+- **Eval** is associated with `Model` and `Benchmark` resources.
Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack.
diff --git a/docs/source/playground/index.md b/docs/source/playground/index.md
index d74bf1a03..9691609ab 100644
--- a/docs/source/playground/index.md
+++ b/docs/source/playground/index.md
@@ -64,7 +64,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
```
```bash
- $ llama-stack-client eval_tasks register \
+ $ llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \
@@ -86,7 +86,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
- Under the hood, it uses Llama Stack's `/providers` API to get information about the providers.
- **API Resources**: Inspect Llama Stack API resources
- - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `eval_tasks`, `shields`).
+ - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `benchmarks`, `shields`).
- Under the hood, it uses Llama Stack's `/<resources>/list` API to get information about each resource.
- Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources.
diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md
index 86f66208a..71dbb47e5 100644
--- a/docs/source/references/evals_reference/index.md
+++ b/docs/source/references/evals_reference/index.md
@@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl
We introduce a set of APIs in Llama Stack to support running evaluations of LLM applications.
- `/datasetio` + `/datasets` API
- `/scoring` + `/scoring_functions` API
-- `/eval` + `/eval_tasks` API
+- `/eval` + `/benchmarks` API
This guide goes over the set of APIs and the developer experience of using Llama Stack to run evaluations for different use cases. Check out our Colab notebook with working evaluation examples [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
@@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
- **Scoring**: evaluate outputs of the system.
- Associated with `ScoringFunction` resource. We provide a suite of out-of-the-box scoring functions and the ability to add custom evaluators. These scoring functions are the core part of defining an evaluation task and the metrics it outputs.
- **Eval**: generate outputs (via Inference or Agents) and perform scoring.
- - Associated with `EvalTask` resource.
+ - Associated with `Benchmark` resource.
Use the following decision tree to decide how to use the Llama Stack evaluation flow.
@@ -77,14 +77,14 @@ system_message = {
"content": SYSTEM_PROMPT_TEMPLATE,
}
-client.eval_tasks.register(
- eval_task_id="meta-reference::mmmu",
+client.benchmarks.register(
+ benchmark_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
response = client.eval.evaluate_rows(
- task_id="meta-reference::mmmu",
+ benchmark_id="meta-reference::mmmu",
input_rows=eval_rows,
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
task_config={
@@ -135,14 +135,14 @@ eval_rows = client.datasetio.get_rows_paginated(
```
```python
-client.eval_tasks.register(
- eval_task_id="meta-reference::simpleqa",
+client.benchmarks.register(
+ benchmark_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"],
)
response = client.eval.evaluate_rows(
- task_id="meta-reference::simpleqa",
+ benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
@@ -192,7 +192,7 @@ agent_config = {
}
response = client.eval.evaluate_rows(
- task_id="meta-reference::simpleqa",
+ benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
@@ -281,7 +281,7 @@ The following examples give the quick steps to start running evaluations using t
#### Benchmark Evaluation CLI
Usage: There are 2 inputs necessary for running a benchmark eval
-- `eval-task-id`: the identifier associated with the eval task. Each `EvalTask` is parametrized by
+- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by
- `dataset_id`: the identifier associated with the dataset.
- `List[scoring_function_id]`: list of scoring function identifiers.
- `eval-task-config`: specifies the configuration of the model / agent to evaluate on.
@@ -289,7 +289,7 @@ Usage: There are 2 inputs necessary for running a benchmark eval
```
llama-stack-client eval run_benchmark <eval-task-id> \
---eval-task-config ~/eval_task_config.json \
+--eval-task-config ~/benchmark_config.json \
--visualize
```
@@ -309,15 +309,15 @@ llama-stack-client eval run_scoring ... --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ]
+$ llama-stack-client benchmarks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <scoring-fn-id-1> [<scoring-fn-id-2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
```
Options:
@@ -191,7 +191,7 @@ Options:
- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
- `--visualize`: Optional flag. If set, visualizes evaluation results after completion
-Example eval_task_config.json:
+Example benchmark_config.json:
```json
{
"type": "benchmark",
diff --git a/docs/source/references/python_sdk_reference/index.md b/docs/source/references/python_sdk_reference/index.md
index 8a06e2244..9d1130422 100644
--- a/docs/source/references/python_sdk_reference/index.md
+++ b/docs/source/references/python_sdk_reference/index.md
@@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job
Methods:
-- client.eval.evaluate_rows(task_id, \*\*params) -> EvaluateResponse
-- client.eval.run_eval(task_id, \*\*params) -> Job
+- client.eval.evaluate_rows(benchmark_id, \*\*params) -> EvaluateResponse
+- client.eval.run_eval(benchmark_id, \*\*params) -> Job
### Jobs
@@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse
Methods:
-- client.eval.jobs.retrieve(job_id, \*, task_id) -> EvaluateResponse
-- client.eval.jobs.cancel(job_id, \*, task_id) -> None
-- client.eval.jobs.status(job_id, \*, task_id) -> Optional[JobStatusResponse]
+- client.eval.jobs.retrieve(job_id, \*, benchmark_id) -> EvaluateResponse
+- client.eval.jobs.cancel(job_id, \*, benchmark_id) -> None
+- client.eval.jobs.status(job_id, \*, benchmark_id) -> Optional[JobStatusResponse]
## Inspect
@@ -443,20 +443,20 @@ Methods:
- client.scoring_functions.list() -> ScoringFunctionListResponse
- client.scoring_functions.register(\*\*params) -> None
-## EvalTasks
+## Benchmarks
Types:
```python
from llama_stack_client.types import (
- EvalTask,
- ListEvalTasksResponse,
- EvalTaskListResponse,
+ Benchmark,
+ ListBenchmarksResponse,
+ BenchmarkListResponse,
)
```
Methods:
-- client.eval_tasks.retrieve(eval_task_id) -> Optional[EvalTask]
-- client.eval_tasks.list() -> EvalTaskListResponse
-- client.eval_tasks.register(\*\*params) -> None
+- client.benchmarks.retrieve(benchmark_id) -> Optional[Benchmark]
+- client.benchmarks.list() -> BenchmarkListResponse
+- client.benchmarks.register(\*\*params) -> None
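Put together, the renamed namespace might be used like the following sketch (the benchmark, dataset, and scoring-function ids are placeholders drawn from examples elsewhere in these docs):

```python
# Register a benchmark, then list and retrieve it via the new client surface.
client.benchmarks.register(
    benchmark_id="meta-reference-mmlu",
    dataset_id="mmlu",
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)

for benchmark in client.benchmarks.list():
    print(benchmark.identifier)

mmlu = client.benchmarks.retrieve("meta-reference-mmlu")
```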
diff --git a/llama_stack/apis/eval_tasks/__init__.py b/llama_stack/apis/benchmarks/__init__.py
similarity index 81%
rename from llama_stack/apis/eval_tasks/__init__.py
rename to llama_stack/apis/benchmarks/__init__.py
index 7ca216706..f8f564957 100644
--- a/llama_stack/apis/eval_tasks/__init__.py
+++ b/llama_stack/apis/benchmarks/__init__.py
@@ -4,4 +4,4 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from .eval_tasks import * # noqa: F401 F403
+from .benchmarks import * # noqa: F401 F403
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
new file mode 100644
index 000000000..50019b18c
--- /dev/null
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -0,0 +1,86 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
+
+from llama_models.schema_utils import json_schema_type, webmethod
+from pydantic import BaseModel, Field
+
+from llama_stack.apis.resource import Resource, ResourceType
+
+
+class CommonBenchmarkFields(BaseModel):
+ dataset_id: str
+ scoring_functions: List[str]
+ metadata: Dict[str, Any] = Field(
+ default_factory=dict,
+ description="Metadata for this evaluation task",
+ )
+
+
+@json_schema_type
+class Benchmark(CommonBenchmarkFields, Resource):
+ type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value
+
+ @property
+ def benchmark_id(self) -> str:
+ return self.identifier
+
+ @property
+ def provider_benchmark_id(self) -> str:
+ return self.provider_resource_id
+
+
+class BenchmarkInput(CommonBenchmarkFields, BaseModel):
+ benchmark_id: str
+ provider_id: Optional[str] = None
+ provider_benchmark_id: Optional[str] = None
+
+
+class ListBenchmarksResponse(BaseModel):
+ data: List[Benchmark]
+
+
+@runtime_checkable
+class Benchmarks(Protocol):
+ @webmethod(route="/eval/benchmarks", method="GET")
+ async def list_benchmarks(self) -> ListBenchmarksResponse: ...
+
+ @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
+ async def get_benchmark(
+ self,
+ benchmark_id: str,
+ ) -> Optional[Benchmark]: ...
+
+ @webmethod(route="/eval/benchmarks", method="POST")
+ async def register_benchmark(
+ self,
+ benchmark_id: str,
+ dataset_id: str,
+ scoring_functions: List[str],
+ provider_benchmark_id: Optional[str] = None,
+ provider_id: Optional[str] = None,
+ metadata: Optional[Dict[str, Any]] = None,
+ ) -> None: ...
+
+ @webmethod(route="/eval-tasks", method="GET")
+ async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ...
+
+ @webmethod(route="/eval-tasks/{task_id}", method="GET")
+ async def DEPRECATED_get_eval_task(
+ self,
+ eval_task_id: str,
+ ) -> Optional[Benchmark]: ...
+
+ @webmethod(route="/eval-tasks", method="POST")
+ async def DEPRECATED_register_eval_task(
+ self,
+ eval_task_id: str,
+ dataset_id: str,
+ scoring_functions: List[str],
+ provider_benchmark_id: Optional[str] = None,
+ provider_id: Optional[str] = None,
+ metadata: Optional[Dict[str, Any]] = None,
+ ) -> None: ...
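For distributions that pre-register benchmarks in their run configuration (the `benchmarks:` list added to the stack config later in this patch), the input model can be populated like this sketch; the identifiers are placeholders:

```python
from llama_stack.apis.benchmarks import BenchmarkInput

# A pre-registered benchmark entry; field names come from CommonBenchmarkFields
# and BenchmarkInput above. metadata is optional free-form data.
mmlu_benchmark = BenchmarkInput(
    benchmark_id="meta-reference-mmlu",
    dataset_id="mmlu",
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    metadata={"split": "test"},
)
```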
diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py
index ccc395b80..0751b2c9b 100644
--- a/llama_stack/apis/datatypes.py
+++ b/llama_stack/apis/datatypes.py
@@ -28,7 +28,7 @@ class Api(Enum):
vector_dbs = "vector_dbs"
datasets = "datasets"
scoring_functions = "scoring_functions"
- eval_tasks = "eval_tasks"
+ benchmarks = "benchmarks"
tool_groups = "tool_groups"
# built-in API
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index ae13a5bd9..e5c782150 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -38,19 +38,9 @@ EvalCandidate = register_schema(
@json_schema_type
-class BenchmarkEvalTaskConfig(BaseModel):
+class BenchmarkConfig(BaseModel):
type: Literal["benchmark"] = "benchmark"
eval_candidate: EvalCandidate
- num_examples: Optional[int] = Field(
- description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
- default=None,
- )
-
-
-@json_schema_type
-class AppEvalTaskConfig(BaseModel):
- type: Literal["app"] = "app"
- eval_candidate: EvalCandidate
scoring_params: Dict[str, ScoringFnParams] = Field(
description="Map between scoring function id and parameters for each scoring function you want to run",
default_factory=dict,
@@ -62,12 +52,6 @@ class AppEvalTaskConfig(BaseModel):
# we could optionally add any specific dataset config here
-EvalTaskConfig = register_schema(
- Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")],
- name="EvalTaskConfig",
-)
-
-
@json_schema_type
class EvaluateResponse(BaseModel):
generations: List[Dict[str, Any]]
@@ -76,27 +60,52 @@ class EvaluateResponse(BaseModel):
class Eval(Protocol):
- @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
+ @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
async def run_eval(
+ self,
+ benchmark_id: str,
+ task_config: BenchmarkConfig,
+ ) -> Job: ...
+
+ @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
+ async def evaluate_rows(
+ self,
+ benchmark_id: str,
+ input_rows: List[Dict[str, Any]],
+ scoring_functions: List[str],
+ task_config: BenchmarkConfig,
+ ) -> EvaluateResponse: ...
+
+ @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
+ async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ...
+
+ @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
+ async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ...
+
+ @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
+ async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
+
+ @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
+ async def DEPRECATED_run_eval(
self,
task_id: str,
- task_config: EvalTaskConfig,
+ task_config: BenchmarkConfig,
) -> Job: ...
@webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
- async def evaluate_rows(
+ async def DEPRECATED_evaluate_rows(
self,
task_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
- task_config: EvalTaskConfig,
+ task_config: BenchmarkConfig,
) -> EvaluateResponse: ...
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
- async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
+ async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
- async def job_cancel(self, task_id: str, job_id: str) -> None: ...
+ async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ...
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
- async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ...
+ async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ...
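Client-side, the renamed methods take a `benchmark_id` plus a `BenchmarkConfig`; a hedged sketch of kicking off an eval against the new benchmark-scoped route (the model id and sampling strategy are placeholders, and the config fields should be checked against the generated spec):

```python
# Start an eval job for a registered benchmark using a plain model candidate.
job = client.eval.run_eval(
    benchmark_id="meta-reference-mmlu",
    task_config={
        "type": "benchmark",
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
            "sampling_params": {"strategy": {"type": "greedy"}},
        },
    },
)
print(job.job_id)
```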
diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py
deleted file mode 100644
index a0a533055..000000000
--- a/llama_stack/apis/eval_tasks/eval_tasks.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
-
-from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-
-
-class CommonEvalTaskFields(BaseModel):
- dataset_id: str
- scoring_functions: List[str]
- metadata: Dict[str, Any] = Field(
- default_factory=dict,
- description="Metadata for this evaluation task",
- )
-
-
-@json_schema_type
-class EvalTask(CommonEvalTaskFields, Resource):
- type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value
-
- @property
- def eval_task_id(self) -> str:
- return self.identifier
-
- @property
- def provider_eval_task_id(self) -> str:
- return self.provider_resource_id
-
-
-class EvalTaskInput(CommonEvalTaskFields, BaseModel):
- eval_task_id: str
- provider_id: Optional[str] = None
- provider_eval_task_id: Optional[str] = None
-
-
-class ListEvalTasksResponse(BaseModel):
- data: List[EvalTask]
-
-
-@runtime_checkable
-class EvalTasks(Protocol):
- @webmethod(route="/eval-tasks", method="GET")
- async def list_eval_tasks(self) -> ListEvalTasksResponse: ...
-
- @webmethod(route="/eval-tasks/{eval_task_id}", method="GET")
- async def get_eval_task(
- self,
- eval_task_id: str,
- ) -> Optional[EvalTask]: ...
-
- @webmethod(route="/eval-tasks", method="POST")
- async def register_eval_task(
- self,
- eval_task_id: str,
- dataset_id: str,
- scoring_functions: List[str],
- provider_eval_task_id: Optional[str] = None,
- provider_id: Optional[str] = None,
- metadata: Optional[Dict[str, Any]] = None,
- ) -> None: ...
diff --git a/llama_stack/apis/resource.py b/llama_stack/apis/resource.py
index 145113a5d..70ec63c55 100644
--- a/llama_stack/apis/resource.py
+++ b/llama_stack/apis/resource.py
@@ -15,7 +15,7 @@ class ResourceType(Enum):
vector_db = "vector_db"
dataset = "dataset"
scoring_function = "scoring_function"
- eval_task = "eval_task"
+ benchmark = "benchmark"
tool = "tool"
tool_group = "tool_group"
diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py
index 97706f22a..f62996081 100644
--- a/llama_stack/distribution/datatypes.py
+++ b/llama_stack/distribution/datatypes.py
@@ -8,10 +8,10 @@ from typing import Annotated, Any, Dict, List, Optional, Union
from pydantic import BaseModel, Field
+from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Dataset, DatasetInput
from llama_stack.apis.eval import Eval
-from llama_stack.apis.eval_tasks import EvalTask, EvalTaskInput
from llama_stack.apis.inference import Inference
from llama_stack.apis.models import Model, ModelInput
from llama_stack.apis.safety import Safety
@@ -37,7 +37,7 @@ RoutableObject = Union[
VectorDB,
Dataset,
ScoringFn,
- EvalTask,
+ Benchmark,
Tool,
ToolGroup,
]
@@ -50,7 +50,7 @@ RoutableObjectWithProvider = Annotated[
VectorDB,
Dataset,
ScoringFn,
- EvalTask,
+ Benchmark,
Tool,
ToolGroup,
],
@@ -173,7 +173,7 @@ a default SQLite store will be used.""",
vector_dbs: List[VectorDBInput] = Field(default_factory=list)
datasets: List[DatasetInput] = Field(default_factory=list)
scoring_fns: List[ScoringFnInput] = Field(default_factory=list)
- eval_tasks: List[EvalTaskInput] = Field(default_factory=list)
+ benchmarks: List[BenchmarkInput] = Field(default_factory=list)
tool_groups: List[ToolGroupInput] = Field(default_factory=list)
server: ServerConfig = Field(
diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py
index 2dcf38463..384e2c3c8 100644
--- a/llama_stack/distribution/distribution.py
+++ b/llama_stack/distribution/distribution.py
@@ -44,7 +44,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
router_api=Api.scoring,
),
AutoRoutedApiInfo(
- routing_table_api=Api.eval_tasks,
+ routing_table_api=Api.benchmarks,
router_api=Api.eval,
),
AutoRoutedApiInfo(
diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py
index 353c2971b..0bc2e774c 100644
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@@ -9,10 +9,10 @@ import logging
from typing import Any, Dict, List, Set
from llama_stack.apis.agents import Agents
+from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval import Eval
-from llama_stack.apis.eval_tasks import EvalTasks
from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models
@@ -37,8 +37,8 @@ from llama_stack.distribution.store import DistributionRegistry
from llama_stack.distribution.utils.dynamic import instantiate_class_type
from llama_stack.providers.datatypes import (
Api,
+ BenchmarksProtocolPrivate,
DatasetsProtocolPrivate,
- EvalTasksProtocolPrivate,
InlineProviderSpec,
ModelsProtocolPrivate,
ProviderSpec,
@@ -73,7 +73,7 @@ def api_protocol_map() -> Dict[Api, Any]:
Api.scoring: Scoring,
Api.scoring_functions: ScoringFunctions,
Api.eval: Eval,
- Api.eval_tasks: EvalTasks,
+ Api.benchmarks: Benchmarks,
Api.post_training: PostTraining,
Api.tool_groups: ToolGroups,
Api.tool_runtime: ToolRuntime,
@@ -92,7 +92,7 @@ def additional_protocols_map() -> Dict[Api, Any]:
ScoringFunctions,
Api.scoring_functions,
),
- Api.eval: (EvalTasksProtocolPrivate, EvalTasks, Api.eval_tasks),
+ Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks),
}
diff --git a/llama_stack/distribution/routers/__init__.py b/llama_stack/distribution/routers/__init__.py
index 18197ca7f..a54f57fb3 100644
--- a/llama_stack/distribution/routers/__init__.py
+++ b/llama_stack/distribution/routers/__init__.py
@@ -11,8 +11,8 @@ from llama_stack.distribution.store import DistributionRegistry
from llama_stack.providers.datatypes import Api, RoutingTable
from .routing_tables import (
+ BenchmarksRoutingTable,
DatasetsRoutingTable,
- EvalTasksRoutingTable,
ModelsRoutingTable,
ScoringFunctionsRoutingTable,
ShieldsRoutingTable,
@@ -33,7 +33,7 @@ async def get_routing_table_impl(
"shields": ShieldsRoutingTable,
"datasets": DatasetsRoutingTable,
"scoring_functions": ScoringFunctionsRoutingTable,
- "eval_tasks": EvalTasksRoutingTable,
+ "benchmarks": BenchmarksRoutingTable,
"tool_groups": ToolGroupsRoutingTable,
}
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index e716e44b0..f45975189 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -9,9 +9,8 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult
from llama_stack.apis.eval import (
- AppEvalTaskConfig,
+ BenchmarkConfig,
Eval,
- EvalTaskConfig,
EvaluateResponse,
Job,
JobStatus,
@@ -347,23 +346,23 @@ class EvalRouter(Eval):
async def run_eval(
self,
- task_id: str,
- task_config: AppEvalTaskConfig,
+ benchmark_id: str,
+ task_config: BenchmarkConfig,
) -> Job:
- return await self.routing_table.get_provider_impl(task_id).run_eval(
- task_id=task_id,
+ return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
+ benchmark_id=benchmark_id,
task_config=task_config,
)
async def evaluate_rows(
self,
- task_id: str,
+ benchmark_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
- task_config: EvalTaskConfig,
+ task_config: BenchmarkConfig,
) -> EvaluateResponse:
- return await self.routing_table.get_provider_impl(task_id).evaluate_rows(
- task_id=task_id,
+ return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
+ benchmark_id=benchmark_id,
input_rows=input_rows,
scoring_functions=scoring_functions,
task_config=task_config,
@@ -371,30 +370,72 @@ class EvalRouter(Eval):
async def job_status(
self,
- task_id: str,
+ benchmark_id: str,
job_id: str,
) -> Optional[JobStatus]:
- return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id)
+ return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)
async def job_cancel(
self,
- task_id: str,
+ benchmark_id: str,
job_id: str,
) -> None:
- await self.routing_table.get_provider_impl(task_id).job_cancel(
- task_id,
+ await self.routing_table.get_provider_impl(benchmark_id).job_cancel(
+ benchmark_id,
job_id,
)
async def job_result(
+ self,
+ benchmark_id: str,
+ job_id: str,
+ ) -> EvaluateResponse:
+ return await self.routing_table.get_provider_impl(benchmark_id).job_result(
+ benchmark_id,
+ job_id,
+ )
+
+ async def DEPRECATED_run_eval(
+ self,
+ task_id: str,
+ task_config: BenchmarkConfig,
+ ) -> Job:
+ return await self.run_eval(benchmark_id=task_id, task_config=task_config)
+
+ async def DEPRECATED_evaluate_rows(
+ self,
+ task_id: str,
+ input_rows: List[Dict[str, Any]],
+ scoring_functions: List[str],
+ task_config: BenchmarkConfig,
+ ) -> EvaluateResponse:
+ return await self.evaluate_rows(
+ benchmark_id=task_id,
+ input_rows=input_rows,
+ scoring_functions=scoring_functions,
+ task_config=task_config,
+ )
+
+ async def DEPRECATED_job_status(
+ self,
+ task_id: str,
+ job_id: str,
+ ) -> Optional[JobStatus]:
+ return await self.job_status(benchmark_id=task_id, job_id=job_id)
+
+ async def DEPRECATED_job_cancel(
+ self,
+ task_id: str,
+ job_id: str,
+ ) -> None:
+ return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
+
+ async def DEPRECATED_job_result(
self,
task_id: str,
job_id: str,
) -> EvaluateResponse:
- return await self.routing_table.get_provider_impl(task_id).job_result(
- task_id,
- job_id,
- )
+ return await self.job_result(benchmark_id=task_id, job_id=job_id)
class ToolRuntimeRouter(ToolRuntime):
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
index 009775ca5..2cddc3970 100644
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@@ -4,14 +4,15 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
+import logging
from typing import Any, Dict, List, Optional
from pydantic import TypeAdapter
+from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import ParamType
from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse
-from llama_stack.apis.eval_tasks import EvalTask, EvalTasks, ListEvalTasksResponse
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType
from llama_stack.apis.resource import ResourceType
from llama_stack.apis.scoring_functions import (
@@ -38,6 +39,8 @@ from llama_stack.distribution.datatypes import (
from llama_stack.distribution.store import DistributionRegistry
from llama_stack.providers.datatypes import Api, RoutingTable
+logger = logging.getLogger(__name__)
+
def get_impl_api(p: Any) -> Api:
return p.__provider_spec__.api
@@ -60,7 +63,7 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable
elif api == Api.scoring:
return await p.register_scoring_function(obj)
elif api == Api.eval:
- return await p.register_eval_task(obj)
+ return await p.register_benchmark(obj)
elif api == Api.tool_runtime:
return await p.register_tool(obj)
else:
@@ -121,7 +124,7 @@ class CommonRoutingTableImpl(RoutingTable):
scoring_functions = await p.list_scoring_functions()
await add_objects(scoring_functions, pid, ScoringFn)
elif api == Api.eval:
- p.eval_task_store = self
+ p.benchmark_store = self
elif api == Api.tool_runtime:
p.tool_store = self
@@ -141,8 +144,8 @@ class CommonRoutingTableImpl(RoutingTable):
return ("DatasetIO", "dataset")
elif isinstance(self, ScoringFunctionsRoutingTable):
return ("Scoring", "scoring_function")
- elif isinstance(self, EvalTasksRoutingTable):
- return ("Eval", "eval_task")
+ elif isinstance(self, BenchmarksRoutingTable):
+ return ("Eval", "benchmark")
elif isinstance(self, ToolGroupsRoutingTable):
return ("Tools", "tool")
else:
@@ -428,20 +431,20 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
await self.register_object(scoring_fn)
-class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks):
- async def list_eval_tasks(self) -> ListEvalTasksResponse:
- return ListEvalTasksResponse(data=await self.get_all_with_type("eval_task"))
+class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
+ async def list_benchmarks(self) -> ListBenchmarksResponse:
+ return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))
- async def get_eval_task(self, eval_task_id: str) -> Optional[EvalTask]:
- return await self.get_object_by_identifier("eval_task", eval_task_id)
+ async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]:
+ return await self.get_object_by_identifier("benchmark", benchmark_id)
- async def register_eval_task(
+ async def register_benchmark(
self,
- eval_task_id: str,
+ benchmark_id: str,
dataset_id: str,
scoring_functions: List[str],
metadata: Optional[Dict[str, Any]] = None,
- provider_eval_task_id: Optional[str] = None,
+ provider_benchmark_id: Optional[str] = None,
provider_id: Optional[str] = None,
) -> None:
if metadata is None:
@@ -453,17 +456,46 @@ class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks):
raise ValueError(
"No provider specified and multiple providers available. Please specify a provider_id."
)
- if provider_eval_task_id is None:
- provider_eval_task_id = eval_task_id
- eval_task = EvalTask(
- identifier=eval_task_id,
+ if provider_benchmark_id is None:
+ provider_benchmark_id = benchmark_id
+ benchmark = Benchmark(
+ identifier=benchmark_id,
dataset_id=dataset_id,
scoring_functions=scoring_functions,
metadata=metadata,
provider_id=provider_id,
- provider_resource_id=provider_eval_task_id,
+ provider_resource_id=provider_benchmark_id,
+ )
+ await self.register_object(benchmark)
+
+ async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse:
+ logger.warning("DEPRECATED: Use /eval/benchmarks instead")
+ return await self.list_benchmarks()
+
+ async def DEPRECATED_get_eval_task(
+ self,
+ eval_task_id: str,
+ ) -> Optional[Benchmark]:
+ logger.warning("DEPRECATED: Use /eval/benchmarks instead")
+ return await self.get_benchmark(eval_task_id)
+
+ async def DEPRECATED_register_eval_task(
+ self,
+ eval_task_id: str,
+ dataset_id: str,
+ scoring_functions: List[str],
+ provider_benchmark_id: Optional[str] = None,
+ provider_id: Optional[str] = None,
+ metadata: Optional[Dict[str, Any]] = None,
+ ) -> None:
+ logger.warning("DEPRECATED: Use /eval/benchmarks instead")
+ return await self.register_benchmark(
+ benchmark_id=eval_task_id,
+ dataset_id=dataset_id,
+ scoring_functions=scoring_functions,
+ metadata=metadata,
+ provider_benchmark_id=provider_benchmark_id,
+ provider_id=provider_id,
)
- await self.register_object(eval_task)
class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py
index 2baad8ac4..9335dc3a9 100644
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@@ -15,10 +15,10 @@ from termcolor import colored
from llama_stack.apis.agents import Agents
from llama_stack.apis.batch_inference import BatchInference
+from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval import Eval
-from llama_stack.apis.eval_tasks import EvalTasks
from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models
@@ -53,7 +53,7 @@ class LlamaStack(
PostTraining,
VectorIO,
Eval,
- EvalTasks,
+ Benchmarks,
Scoring,
ScoringFunctions,
DatasetIO,
@@ -78,7 +78,7 @@ RESOURCES = [
"register_scoring_function",
"list_scoring_functions",
),
- ("eval_tasks", Api.eval_tasks, "register_eval_task", "list_eval_tasks"),
+ ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"),
("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"),
]
diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md
index c0a2597af..8fceb5c63 100644
--- a/llama_stack/distribution/ui/README.md
+++ b/llama_stack/distribution/ui/README.md
@@ -26,7 +26,7 @@ $ llama-stack-client datasets register \
```
```bash
-$ llama-stack-client eval_tasks register \
+$ llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \
diff --git a/llama_stack/distribution/ui/page/distribution/eval_tasks.py b/llama_stack/distribution/ui/page/distribution/benchmarks.py
rename from llama_stack/distribution/ui/page/distribution/eval_tasks.py
rename to llama_stack/distribution/ui/page/distribution/benchmarks.py
index f58969663..1428ae9ab 100644
--- a/llama_stack/distribution/ui/page/distribution/eval_tasks.py
+++ b/llama_stack/distribution/ui/page/distribution/benchmarks.py
@@ -8,12 +8,12 @@ import streamlit as st
from modules.api import llama_stack_api
-def eval_tasks():
- # Eval Tasks Section
- st.header("Eval Tasks")
+def benchmarks():
+ # Benchmarks Section
+ st.header("Benchmarks")
- eval_tasks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.eval_tasks.list()}
+ benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()}
- if len(eval_tasks_info) > 0:
- selected_eval_task = st.selectbox("Select an eval task", list(eval_tasks_info.keys()), key="eval_task_inspect")
- st.json(eval_tasks_info[selected_eval_task], expanded=True)
+ if len(benchmarks_info) > 0:
+ selected_benchmark = st.selectbox("Select a benchmark", list(benchmarks_info.keys()), key="benchmark_inspect")
+ st.json(benchmarks_info[selected_benchmark], expanded=True)
diff --git a/llama_stack/distribution/ui/page/distribution/resources.py b/llama_stack/distribution/ui/page/distribution/resources.py
index 94b840bcb..684270d4d 100644
--- a/llama_stack/distribution/ui/page/distribution/resources.py
+++ b/llama_stack/distribution/ui/page/distribution/resources.py
@@ -4,8 +4,8 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
+from page.distribution.benchmarks import benchmarks
from page.distribution.datasets import datasets
-from page.distribution.eval_tasks import eval_tasks
from page.distribution.models import models
from page.distribution.scoring_functions import scoring_functions
from page.distribution.shields import shields
@@ -20,7 +20,7 @@ def resources_page():
"Shields",
"Scoring Functions",
"Datasets",
- "Eval Tasks",
+ "Benchmarks",
]
icons = ["magic", "memory", "shield", "file-bar-graph", "database", "list-task"]
selected_resource = option_menu(
@@ -34,8 +34,8 @@ def resources_page():
},
},
)
- if selected_resource == "Eval Tasks":
- eval_tasks()
+ if selected_resource == "Benchmarks":
+ benchmarks()
elif selected_resource == "Vector Databases":
vector_dbs()
elif selected_resource == "Datasets":
diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py
index 112d9cff0..f1cae714a 100644
--- a/llama_stack/distribution/ui/page/evaluations/native_eval.py
+++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py
@@ -11,28 +11,28 @@ import streamlit as st
from modules.api import llama_stack_api
-def select_eval_task_1():
- # Select Eval Tasks
+def select_benchmark_1():
+ # Select Benchmarks
st.subheader("1. Choose An Eval Task")
- eval_tasks = llama_stack_api.client.eval_tasks.list()
- eval_tasks = {et.identifier: et for et in eval_tasks}
- eval_tasks_names = list(eval_tasks.keys())
- selected_eval_task = st.selectbox(
+ benchmarks = llama_stack_api.client.benchmarks.list()
+ benchmarks = {et.identifier: et for et in benchmarks}
+ benchmarks_names = list(benchmarks.keys())
+ selected_benchmark = st.selectbox(
"Choose an eval task.",
- options=eval_tasks_names,
+ options=benchmarks_names,
help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
)
with st.expander("View Eval Task"):
- st.json(eval_tasks[selected_eval_task], expanded=True)
+ st.json(benchmarks[selected_benchmark], expanded=True)
- st.session_state["selected_eval_task"] = selected_eval_task
- st.session_state["eval_tasks"] = eval_tasks
+ st.session_state["selected_benchmark"] = selected_benchmark
+ st.session_state["benchmarks"] = benchmarks
if st.button("Confirm", key="confirm_1"):
- st.session_state["selected_eval_task_1_next"] = True
+ st.session_state["selected_benchmark_1_next"] = True
def define_eval_candidate_2():
- if not st.session_state.get("selected_eval_task_1_next", None):
+ if not st.session_state.get("selected_benchmark_1_next", None):
return
st.subheader("2. Define Eval Candidate")
@@ -161,11 +161,11 @@ def run_evaluation_3():
Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
"""
)
- selected_eval_task = st.session_state["selected_eval_task"]
- eval_tasks = st.session_state["eval_tasks"]
+ selected_benchmark = st.session_state["selected_benchmark"]
+ benchmarks = st.session_state["benchmarks"]
eval_candidate = st.session_state["eval_candidate"]
- dataset_id = eval_tasks[selected_eval_task].dataset_id
+ dataset_id = benchmarks[selected_benchmark].dataset_id
rows = llama_stack_api.client.datasetio.get_rows_paginated(
dataset_id=dataset_id,
rows_in_page=-1,
@@ -180,16 +180,16 @@ def run_evaluation_3():
help="Number of examples from the dataset to evaluate. ",
)
- eval_task_config = {
+ benchmark_config = {
"type": "benchmark",
"eval_candidate": eval_candidate,
"scoring_params": {},
}
with st.expander("View Evaluation Task", expanded=True):
- st.json(eval_tasks[selected_eval_task], expanded=True)
+ st.json(benchmarks[selected_benchmark], expanded=True)
with st.expander("View Evaluation Task Configuration", expanded=True):
- st.json(eval_task_config, expanded=True)
+ st.json(benchmark_config, expanded=True)
# Add run button and handle evaluation
if st.button("Run Evaluation"):
@@ -209,10 +209,10 @@ def run_evaluation_3():
progress_bar.progress(progress, text=progress_text)
# Run evaluation for current row
eval_res = llama_stack_api.client.eval.evaluate_rows(
- task_id=selected_eval_task,
+ benchmark_id=selected_benchmark,
input_rows=[r],
- scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
- task_config=eval_task_config,
+ scoring_functions=benchmarks[selected_benchmark].scoring_functions,
+ task_config=benchmark_config,
)
for k in r.keys():
@@ -225,7 +225,7 @@ def run_evaluation_3():
output_res[k] = []
output_res[k].append(eval_res.generations[0][k])
- for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
+ for scoring_fn in benchmarks[selected_benchmark].scoring_functions:
if scoring_fn not in output_res:
output_res[scoring_fn] = []
output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
@@ -245,7 +245,7 @@ def native_evaluation_page():
st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
st.title("📊 Evaluations (Generation + Scoring)")
- select_eval_task_1()
+ select_benchmark_1()
define_eval_candidate_2()
run_evaluation_3()
diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py
index ccdaf76e7..b92f9dc0a 100644
--- a/llama_stack/providers/datatypes.py
+++ b/llama_stack/providers/datatypes.py
@@ -10,9 +10,9 @@ from urllib.parse import urlparse
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field
+from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasets import Dataset
from llama_stack.apis.datatypes import Api
-from llama_stack.apis.eval_tasks import EvalTask
from llama_stack.apis.models import Model
from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.apis.shields import Shield
@@ -48,8 +48,8 @@ class ScoringFunctionsProtocolPrivate(Protocol):
async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ...
-class EvalTasksProtocolPrivate(Protocol):
- async def register_eval_task(self, eval_task: EvalTask) -> None: ...
+class BenchmarksProtocolPrivate(Protocol):
+ async def register_benchmark(self, benchmark: Benchmark) -> None: ...
class ToolsProtocolPrivate(Protocol):
diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py
index 1c44caf7f..cd99c9ad8 100644
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@@ -8,13 +8,13 @@ from typing import Any, Dict, List, Optional
from tqdm import tqdm
from llama_stack.apis.agents import Agents, StepType
+from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.eval_tasks import EvalTask
from llama_stack.apis.inference import Inference, UserMessage
from llama_stack.apis.scoring import Scoring
from llama_stack.distribution.datatypes import Api
-from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
+from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
from llama_stack.providers.inline.agents.meta_reference.agent_instance import (
MEMORY_QUERY_TOOL,
)
@@ -26,15 +26,15 @@ from llama_stack.providers.utils.common.data_schema_validator import (
from llama_stack.providers.utils.kvstore import kvstore_impl
from .....apis.common.job_types import Job
-from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
+from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse, JobStatus
from .config import MetaReferenceEvalConfig
-EVAL_TASKS_PREFIX = "eval_tasks:"
+EVAL_TASKS_PREFIX = "benchmarks:"
class MetaReferenceEvalImpl(
Eval,
- EvalTasksProtocolPrivate,
+ BenchmarksProtocolPrivate,
):
def __init__(
self,
@@ -55,36 +55,36 @@ class MetaReferenceEvalImpl(
# TODO: assume sync job, will need jobs API for async scheduling
self.jobs = {}
- self.eval_tasks = {}
+ self.benchmarks = {}
async def initialize(self) -> None:
self.kvstore = await kvstore_impl(self.config.kvstore)
- # Load existing eval_tasks from kvstore
+ # Load existing benchmarks from kvstore
start_key = EVAL_TASKS_PREFIX
end_key = f"{EVAL_TASKS_PREFIX}\xff"
- stored_eval_tasks = await self.kvstore.range(start_key, end_key)
+ stored_benchmarks = await self.kvstore.range(start_key, end_key)
- for eval_task in stored_eval_tasks:
- eval_task = EvalTask.model_validate_json(eval_task)
- self.eval_tasks[eval_task.identifier] = eval_task
+ for benchmark in stored_benchmarks:
+ benchmark = Benchmark.model_validate_json(benchmark)
+ self.benchmarks[benchmark.identifier] = benchmark
async def shutdown(self) -> None: ...
- async def register_eval_task(self, task_def: EvalTask) -> None:
+ async def register_benchmark(self, task_def: Benchmark) -> None:
# Store in kvstore
key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}"
await self.kvstore.set(
key=key,
value=task_def.model_dump_json(),
)
- self.eval_tasks[task_def.identifier] = task_def
+ self.benchmarks[task_def.identifier] = task_def
async def run_eval(
self,
- task_id: str,
- task_config: EvalTaskConfig,
+ benchmark_id: str,
+ task_config: BenchmarkConfig,
) -> Job:
- task_def = self.eval_tasks[task_id]
+ task_def = self.benchmarks[benchmark_id]
dataset_id = task_def.dataset_id
candidate = task_config.eval_candidate
scoring_functions = task_def.scoring_functions
@@ -95,7 +95,7 @@ class MetaReferenceEvalImpl(
rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
)
res = await self.evaluate_rows(
- task_id=task_id,
+ benchmark_id=benchmark_id,
input_rows=all_rows.rows,
scoring_functions=scoring_functions,
task_config=task_config,
@@ -108,7 +108,7 @@ class MetaReferenceEvalImpl(
return Job(job_id=job_id)
async def _run_agent_generation(
- self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
+ self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
) -> List[Dict[str, Any]]:
candidate = task_config.eval_candidate
create_response = await self.agents_api.create_agent(candidate.config)
@@ -151,7 +151,7 @@ class MetaReferenceEvalImpl(
return generations
async def _run_model_generation(
- self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
+ self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
) -> List[Dict[str, Any]]:
candidate = task_config.eval_candidate
assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
@@ -187,10 +187,10 @@ class MetaReferenceEvalImpl(
async def evaluate_rows(
self,
- task_id: str,
+ benchmark_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
- task_config: EvalTaskConfig,
+ task_config: BenchmarkConfig,
) -> EvaluateResponse:
candidate = task_config.eval_candidate
if candidate.type == "agent":
@@ -203,7 +203,7 @@ class MetaReferenceEvalImpl(
# scoring with generated_answer
score_input_rows = [input_r | generated_r for input_r, generated_r in zip(input_rows, generations)]
- if task_config.type == "app" and task_config.scoring_params is not None:
+ if task_config.scoring_params is not None:
scoring_functions_dict = {
scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None)
for scoring_fn_id in scoring_functions
@@ -217,18 +217,60 @@ class MetaReferenceEvalImpl(
return EvaluateResponse(generations=generations, scores=score_response.results)
- async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]:
+ async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
if job_id in self.jobs:
return JobStatus.completed
return None
- async def job_cancel(self, task_id: str, job_id: str) -> None:
+ async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
raise NotImplementedError("Job cancel is not implemented yet")
- async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse:
- status = await self.job_status(task_id, job_id)
+ async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
+ status = await self.job_status(benchmark_id, job_id)
if not status or status != JobStatus.completed:
raise ValueError(f"Job is not completed, Status: {status.value}")
return self.jobs[job_id]
+
+ async def DEPRECATED_run_eval(
+ self,
+ task_id: str,
+ task_config: BenchmarkConfig,
+ ) -> Job:
+ return await self.run_eval(benchmark_id=task_id, task_config=task_config)
+
+ async def DEPRECATED_evaluate_rows(
+ self,
+ task_id: str,
+ input_rows: List[Dict[str, Any]],
+ scoring_functions: List[str],
+ task_config: BenchmarkConfig,
+ ) -> EvaluateResponse:
+ return await self.evaluate_rows(
+ benchmark_id=task_id,
+ input_rows=input_rows,
+ scoring_functions=scoring_functions,
+ task_config=task_config,
+ )
+
+ async def DEPRECATED_job_status(
+ self,
+ task_id: str,
+ job_id: str,
+ ) -> Optional[JobStatus]:
+ return await self.job_status(benchmark_id=task_id, job_id=job_id)
+
+ async def DEPRECATED_job_cancel(
+ self,
+ task_id: str,
+ job_id: str,
+ ) -> None:
+ return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
+
+ async def DEPRECATED_job_result(
+ self,
+ task_id: str,
+ job_id: str,
+ ) -> EvaluateResponse:
+ return await self.job_result(benchmark_id=task_id, job_id=job_id)
diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py
index ec3d08728..ad80b8601 100644
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@@ -10,8 +10,8 @@ import pytest
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
from llama_stack.apis.eval.eval import (
- AppEvalTaskConfig,
- BenchmarkEvalTaskConfig,
+ AppBenchmarkConfig,
+ BenchmarkBenchmarkConfig,
ModelCandidate,
)
from llama_stack.apis.inference import SamplingParams
@@ -30,18 +30,18 @@ from .constants import JUDGE_PROMPT
class Testeval:
@pytest.mark.asyncio
- async def test_eval_tasks_list(self, eval_stack):
+ async def test_benchmarks_list(self, eval_stack):
# NOTE: this needs you to ensure that you are starting from a clean state
# but so far we don't have an unregister API unfortunately, so be careful
- eval_tasks_impl = eval_stack[Api.eval_tasks]
- response = await eval_tasks_impl.list_eval_tasks()
+ benchmarks_impl = eval_stack[Api.benchmarks]
+ response = await benchmarks_impl.list_benchmarks()
assert isinstance(response, list)
@pytest.mark.asyncio
async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model):
- eval_impl, eval_tasks_impl, datasetio_impl, datasets_impl, models_impl = (
+ eval_impl, benchmarks_impl, datasetio_impl, datasets_impl, models_impl = (
eval_stack[Api.eval],
- eval_stack[Api.eval_tasks],
+ eval_stack[Api.benchmarks],
eval_stack[Api.datasetio],
eval_stack[Api.datasets],
eval_stack[Api.models],
@@ -59,17 +59,17 @@ class Testeval:
scoring_functions = [
"basic::equality",
]
- task_id = "meta-reference::app_eval"
- await eval_tasks_impl.register_eval_task(
- eval_task_id=task_id,
+ benchmark_id = "meta-reference::app_eval"
+ await benchmarks_impl.register_benchmark(
+ benchmark_id=benchmark_id,
dataset_id="test_dataset_for_eval",
scoring_functions=scoring_functions,
)
response = await eval_impl.evaluate_rows(
- task_id=task_id,
+ benchmark_id=benchmark_id,
input_rows=rows.rows,
scoring_functions=scoring_functions,
- task_config=AppEvalTaskConfig(
+ task_config=AppBenchmarkConfig(
eval_candidate=ModelCandidate(
model=inference_model,
sampling_params=SamplingParams(),
@@ -92,9 +92,9 @@ class Testeval:
@pytest.mark.asyncio
async def test_eval_run_eval(self, eval_stack, inference_model, judge_model):
- eval_impl, eval_tasks_impl, datasets_impl, models_impl = (
+ eval_impl, benchmarks_impl, datasets_impl, models_impl = (
eval_stack[Api.eval],
- eval_stack[Api.eval_tasks],
+ eval_stack[Api.benchmarks],
eval_stack[Api.datasets],
eval_stack[Api.models],
)
@@ -105,15 +105,15 @@ class Testeval:
"basic::subset_of",
]
- task_id = "meta-reference::app_eval-2"
- await eval_tasks_impl.register_eval_task(
- eval_task_id=task_id,
+ benchmark_id = "meta-reference::app_eval-2"
+ await benchmarks_impl.register_benchmark(
+ benchmark_id=benchmark_id,
dataset_id="test_dataset_for_eval",
scoring_functions=scoring_functions,
)
response = await eval_impl.run_eval(
- task_id=task_id,
- task_config=AppEvalTaskConfig(
+ benchmark_id=benchmark_id,
+ task_config=AppBenchmarkConfig(
eval_candidate=ModelCandidate(
model=inference_model,
sampling_params=SamplingParams(),
@@ -121,9 +121,9 @@ class Testeval:
),
)
assert response.job_id == "0"
- job_status = await eval_impl.job_status(task_id, response.job_id)
+ job_status = await eval_impl.job_status(benchmark_id, response.job_id)
assert job_status and job_status.value == "completed"
- eval_response = await eval_impl.job_result(task_id, response.job_id)
+ eval_response = await eval_impl.job_result(benchmark_id, response.job_id)
assert eval_response is not None
assert len(eval_response.generations) == 5
@@ -131,9 +131,9 @@ class Testeval:
@pytest.mark.asyncio
async def test_eval_run_benchmark_eval(self, eval_stack, inference_model):
- eval_impl, eval_tasks_impl, datasets_impl, models_impl = (
+ eval_impl, benchmarks_impl, datasets_impl, models_impl = (
eval_stack[Api.eval],
- eval_stack[Api.eval_tasks],
+ eval_stack[Api.benchmarks],
eval_stack[Api.datasets],
eval_stack[Api.models],
)
@@ -159,20 +159,20 @@ class Testeval:
)
# register eval task
- await eval_tasks_impl.register_eval_task(
- eval_task_id="meta-reference-mmlu",
+ await benchmarks_impl.register_benchmark(
+ benchmark_id="meta-reference-mmlu",
dataset_id="mmlu",
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
# list benchmarks
- response = await eval_tasks_impl.list_eval_tasks()
+ response = await benchmarks_impl.list_benchmarks()
assert len(response) > 0
benchmark_id = "meta-reference-mmlu"
response = await eval_impl.run_eval(
- task_id=benchmark_id,
- task_config=BenchmarkEvalTaskConfig(
+ benchmark_id=benchmark_id,
+ task_config=BenchmarkBenchmarkConfig(
eval_candidate=ModelCandidate(
model=inference_model,
sampling_params=SamplingParams(),
diff --git a/llama_stack/providers/tests/resolver.py b/llama_stack/providers/tests/resolver.py
index 0ff632717..76343b7f4 100644
--- a/llama_stack/providers/tests/resolver.py
+++ b/llama_stack/providers/tests/resolver.py
@@ -10,8 +10,8 @@ from typing import Any, Dict, List, Optional
from pydantic import BaseModel
+from llama_stack.apis.benchmarks import BenchmarkInput
from llama_stack.apis.datasets import DatasetInput
-from llama_stack.apis.eval_tasks import EvalTaskInput
from llama_stack.apis.models import ModelInput
from llama_stack.apis.scoring_functions import ScoringFnInput
from llama_stack.apis.shields import ShieldInput
@@ -42,7 +42,7 @@ async def construct_stack_for_test(
vector_dbs: Optional[List[VectorDBInput]] = None,
datasets: Optional[List[DatasetInput]] = None,
scoring_fns: Optional[List[ScoringFnInput]] = None,
- eval_tasks: Optional[List[EvalTaskInput]] = None,
+ benchmarks: Optional[List[BenchmarkInput]] = None,
tool_groups: Optional[List[ToolGroupInput]] = None,
) -> TestStack:
sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db")
@@ -56,7 +56,7 @@ async def construct_stack_for_test(
vector_dbs=vector_dbs or [],
datasets=datasets or [],
scoring_fns=scoring_fns or [],
- eval_tasks=eval_tasks or [],
+ benchmarks=benchmarks or [],
tool_groups=tool_groups or [],
)
run_config = parse_and_maybe_upgrade_config(run_config)
diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml
index be6c9a928..7d03b7c29 100644
--- a/llama_stack/templates/bedrock/run.yaml
+++ b/llama_stack/templates/bedrock/run.yaml
@@ -107,7 +107,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml
index 05d3f4525..6afff2be2 100644
--- a/llama_stack/templates/cerebras/run.yaml
+++ b/llama_stack/templates/cerebras/run.yaml
@@ -109,7 +109,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml
index 04c5957d4..ddec3a715 100644
--- a/llama_stack/templates/dell/run-with-safety.yaml
+++ b/llama_stack/templates/dell/run-with-safety.yaml
@@ -108,7 +108,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: brave-search
diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml
index 706444eb1..9394c94ef 100644
--- a/llama_stack/templates/dell/run.yaml
+++ b/llama_stack/templates/dell/run.yaml
@@ -99,7 +99,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: brave-search
diff --git a/llama_stack/templates/experimental-post-training/run.yaml b/llama_stack/templates/experimental-post-training/run.yaml
index 75d103c9f..e70ccdd2d 100644
--- a/llama_stack/templates/experimental-post-training/run.yaml
+++ b/llama_stack/templates/experimental-post-training/run.yaml
@@ -85,4 +85,4 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml
index 0fbe14a5a..8f95e9d59 100644
--- a/llama_stack/templates/fireworks/run-with-safety.yaml
+++ b/llama_stack/templates/fireworks/run-with-safety.yaml
@@ -164,7 +164,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml
index ccf67dcbb..64229a5d8 100644
--- a/llama_stack/templates/fireworks/run.yaml
+++ b/llama_stack/templates/fireworks/run.yaml
@@ -153,7 +153,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml
index f520a2fda..867d7a076 100644
--- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml
+++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml
@@ -116,7 +116,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml
index 708cb1bcc..d60acdefd 100644
--- a/llama_stack/templates/hf-endpoint/run.yaml
+++ b/llama_stack/templates/hf-endpoint/run.yaml
@@ -106,7 +106,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml
index 7f0abf5be..e58ad15b3 100644
--- a/llama_stack/templates/hf-serverless/run-with-safety.yaml
+++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml
@@ -116,7 +116,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml
index c0b7a4c60..5045e821a 100644
--- a/llama_stack/templates/hf-serverless/run.yaml
+++ b/llama_stack/templates/hf-serverless/run.yaml
@@ -106,7 +106,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
index c5286fc6b..caac65c8c 100644
--- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
+++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
@@ -118,7 +118,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml
index 310585f23..bade9a076 100644
--- a/llama_stack/templates/meta-reference-gpu/run.yaml
+++ b/llama_stack/templates/meta-reference-gpu/run.yaml
@@ -107,7 +107,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml
index d43cf3917..f131e8ea6 100644
--- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml
+++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml
@@ -109,7 +109,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml
index c8ae362f5..14fb28354 100644
--- a/llama_stack/templates/nvidia/run.yaml
+++ b/llama_stack/templates/nvidia/run.yaml
@@ -139,7 +139,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml
index ac5dab755..9d5bfc7a0 100644
--- a/llama_stack/templates/ollama/run-with-safety.yaml
+++ b/llama_stack/templates/ollama/run-with-safety.yaml
@@ -113,7 +113,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml
index 3a60fe61f..9ac1f3267 100644
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@@ -110,7 +110,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml
index 1fe998a1f..dd43f21f6 100644
--- a/llama_stack/templates/remote-vllm/run-with-safety.yaml
+++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml
@@ -118,7 +118,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml
index 9d3db8a31..24cd207c7 100644
--- a/llama_stack/templates/remote-vllm/run.yaml
+++ b/llama_stack/templates/remote-vllm/run.yaml
@@ -107,7 +107,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml
index 39b0f3c4e..26815dcd0 100644
--- a/llama_stack/templates/sambanova/run.yaml
+++ b/llama_stack/templates/sambanova/run.yaml
@@ -118,7 +118,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml
index ed6c9ef6f..e1d85f59a 100644
--- a/llama_stack/templates/tgi/run-with-safety.yaml
+++ b/llama_stack/templates/tgi/run-with-safety.yaml
@@ -106,7 +106,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml
index 8bf76f37b..fc73e0978 100644
--- a/llama_stack/templates/tgi/run.yaml
+++ b/llama_stack/templates/tgi/run.yaml
@@ -105,7 +105,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml
index 298926630..f101a5d60 100644
--- a/llama_stack/templates/together/run-with-safety.yaml
+++ b/llama_stack/templates/together/run-with-safety.yaml
@@ -159,7 +159,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml
index 920003759..8af85979d 100644
--- a/llama_stack/templates/together/run.yaml
+++ b/llama_stack/templates/together/run.yaml
@@ -148,7 +148,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml
index 41a545e1a..cdce5510d 100644
--- a/llama_stack/templates/vllm-gpu/run.yaml
+++ b/llama_stack/templates/vllm-gpu/run.yaml
@@ -109,7 +109,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
-eval_tasks: []
+benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search