From ec721b3867d664a486faebb6a2a2b7a77ecd0b71 Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Wed, 12 Feb 2025 20:48:05 -0800
Subject: [PATCH] Move eval routes to /v1/eval/benchmarks, deprecate /v1/eval/tasks

---
 docs/_static/llama-stack-spec.html          | 619 +++++++++++++-------
 docs/_static/llama-stack-spec.yaml          | 382 ++++++++----
 llama_stack/apis/eval/eval.py               |  25 +
 llama_stack/distribution/routers/routers.py |   3 +-
 4 files changed, 695 insertions(+), 334 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index c656808a6..652dae562 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -67,8 +67,8 @@
                 "description": "",
                 "parameters": [
                     {
-                        "name": "benchmark_id",
-                        "in": "path",
+                        "name": "task_id",
+                        "in": "query",
                         "required": true,
                         "schema": {
                             "type": "string"
@@ -114,7 +114,7 @@
                     "content": {
                         "application/json": {
                             "schema": {
-                                "$ref": "#/components/schemas/DeprecatedRegisterBenchmarkRequest"
+                                "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest"
                             }
                         }
                     },
@@ -613,7 +613,7 @@
                 }
             }
         },
-        "/v1/eval/tasks/{benchmark_id}/evaluations": {
+        "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
             "post": {
                 "responses": {
                     "200": {
@@ -653,6 +653,47 @@
                 }
             }
         },
+        "/v1/eval/tasks/{task_id}/evaluations": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluateResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "task_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/EvaluateRowsDeprecatedRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                },
+                "deprecated": true
+            }
+        },
         "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": {
             "get": {
                 "responses": {
@@ -753,6 +794,43 @@
                 ]
             }
         },
+        "/v1/eval/benchmarks/{benchmark_id}": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "oneOf": [
+                                        {
+                                            "$ref": "#/components/schemas/Benchmark"
+                                        },
+                                        {
+                                            "type": "null"
+                                        }
+                                    ]
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Benchmarks"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "benchmark_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
+            }
+        },
         "/v1/datasets/{dataset_id}": {
             "get": {
                 "responses": {
@@ -811,43 +889,6 @@
                 ]
             }
         },
-        "/v1/eval/tasks/{benchmark_id}": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "oneOf": [
-                                        {
-                                            "$ref": "#/components/schemas/Benchmark"
-                                        },
-                                        {
-                                            "type": "null"
-                                        }
-                                    ]
-                                }
-                            }
-                        }
-                    }
-                },
-                "tags": [
-                    "Benchmarks"
-                ],
-                "description": "",
-                "parameters": [
-                    {
-                        "name": "benchmark_id",
-                        "in": "path",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            }
-        },
         "/v1/models/{model_id}": {
             "get": {
                 "responses": {
@@ -1431,7 +1472,7 @@
                 }
             }
         },
-        "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": {
+        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
             "get": {
                 "responses": {
                     "200": {
@@ -1505,7 +1546,83 @@
                 ]
             }
         },
-        "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": {
+        "/v1/eval/tasks/{task_id}/jobs/{job_id}": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "oneOf": [
+                                        {
+                                            "$ref": "#/components/schemas/JobStatus"
+                                        },
+                                        {
+                                            "type": "null"
+                                        }
+                                    ]
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "task_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    },
+                    {
+                        "name": "job_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "deprecated": true
+            },
+            "delete": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "task_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    },
+                    {
+                        "name": "job_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "deprecated": true
+            }
+        },
+        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
             "get": {
                 "responses": {
                     "200": {
@@ -1525,7 +1642,7 @@
                 "description": "",
                 "parameters": [
                     {
-                        "name": "job_id",
+                        "name": "benchmark_id",
                         "in": "path",
                         "required": true,
                         "schema": {
@@ -1533,7 +1650,7 @@
                         }
                     },
                     {
-                        "name": "benchmark_id",
+                        "name": "job_id",
                         "in": "path",
                         "required": true,
                         "schema": {
@@ -1543,6 +1660,88 @@
                 ]
             }
         },
+        "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluateResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "task_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    },
+                    {
+                        "name": "job_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "deprecated": true
+            }
+        },
+        "/v1/eval/benchmarks": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/ListBenchmarksResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Benchmarks"
+                ],
+                "description": "",
+                "parameters": []
+            },
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    }
+                },
+                "tags": [
+                    "Benchmarks"
+                ],
+                "description": "",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/RegisterBenchmarkRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
         "/v1/datasets": {
             "get": {
                 "responses": {
@@ -1586,49 +1785,6 @@
                 }
             }
         },
-        "/v1/eval/tasks": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/ListBenchmarksResponse"
-                                }
-                            }
-                        }
-                    }
-                },
-                "tags": [
-                    "Benchmarks"
-                ],
-                "description": "",
-                "parameters": []
-            },
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "OK"
-                    }
-                },
-                "tags": [
-                    "Benchmarks"
-                ],
-                "description": "",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/RegisterBenchmarkRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
         "/v1/models": {
             "get": {
                 "responses": {
@@ -2204,7 +2360,7 @@
                 ]
             }
         },
-        "/v1/eval/tasks/{benchmark_id}/jobs": {
+        "/v1/eval/benchmarks/{benchmark_id}/jobs": {
             "post": {
                 "responses": {
                     "200": {
@@ -2244,6 +2400,47 @@
                 }
             }
         },
+        "/v1/eval/tasks/{task_id}/jobs": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/Job"
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "task_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/RunEvalDeprecatedRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                },
+                "deprecated": true
+            }
+        },
         "/v1/safety/run-shield": {
             "post": {
                 "responses": {
@@ -2526,10 +2723,10 @@
                     "data"
                 ]
             },
-            "DeprecatedRegisterBenchmarkRequest": {
+            "DeprecatedRegisterEvalTaskRequest": {
                 "type": "object",
                 "properties": {
-                    "benchmark_id": {
+                    "task_id": {
                         "type": "string"
                     },
                     "dataset_id": {
@@ -2575,7 +2772,7 @@
                 },
                 "additionalProperties": false,
                 "required": [
-                    "benchmark_id",
+                    "task_id",
                     "dataset_id",
                     "scoring_functions"
                 ]
@@ -4745,34 +4942,6 @@
                     "accuracy"
                 ]
             },
-            "AppBenchmarkConfig": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "app",
-                        "default": "app"
-                    },
-                    "eval_candidate": {
-                        "$ref": "#/components/schemas/EvalCandidate"
-                    },
-                    "scoring_params": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringFnParams"
-                        }
-                    },
-                    "num_examples": {
-                        "type": "integer"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "eval_candidate",
-                    "scoring_params"
-                ]
-            },
             "BasicScoringFnParams": {
                 "type": "object",
                 "properties": {
@@ -4793,25 +4962,26 @@
                     "type"
                 ]
             },
-            "BenchmarkBenchmarkConfig": {
+            "BenchmarkConfig": {
                 "type": "object",
                 "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "benchmark",
-                        "default": "benchmark"
-                    },
                     "eval_candidate": {
                         "$ref": "#/components/schemas/EvalCandidate"
                     },
+                    "scoring_params": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ScoringFnParams"
+                        }
+                    },
                     "num_examples": {
                         "type": "integer"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "type",
-                    "eval_candidate"
+                    "eval_candidate",
+                    "scoring_params"
                 ]
             },
             "EvalCandidate": {
@@ -4831,23 +5001,6 @@
                     }
                 }
             },
-            "BenchmarkConfig": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/BenchmarkBenchmarkConfig"
-                    },
-                    {
-                        "$ref": "#/components/schemas/AppBenchmarkConfig"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "benchmark": "#/components/schemas/BenchmarkBenchmarkConfig",
-                        "app": "#/components/schemas/AppBenchmarkConfig"
-                    }
-                }
-            },
             "LLMAsJudgeScoringFnParams": {
                 "type": "object",
                 "properties": {
@@ -5108,6 +5261,54 @@
                     "aggregated_results"
                 ]
             },
+            "EvaluateRowsDeprecatedRequest": {
+                "type": "object",
+                "properties": {
+                    "input_rows": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
+                        }
+                    },
+                    "scoring_functions": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    },
+                    "task_config": {
+                        "$ref": "#/components/schemas/BenchmarkConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "input_rows",
+                    "scoring_functions",
+                    "task_config"
+                ]
+            },
             "Session": {
                 "type": "object",
                 "properties": {
@@ -7304,60 +7505,6 @@
                     "data"
                 ]
             },
-            "RegisterDatasetRequest": {
-                "type": "object",
-                "properties": {
-                    "dataset_id": {
-                        "type": "string"
-                    },
-                    "dataset_schema": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ParamType"
-                        }
-                    },
-                    "url": {
-                        "$ref": "#/components/schemas/URL"
-                    },
-                    "provider_dataset_id": {
-                        "type": "string"
-                    },
-                    "provider_id": {
-                        "type": "string"
-                    },
-                    "metadata": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "dataset_id",
-                    "dataset_schema",
-                    "url"
-                ]
-            },
             "RegisterBenchmarkRequest": {
                 "type": "object",
                 "properties": {
@@ -7412,6 +7559,60 @@
                     "scoring_functions"
                 ]
             },
+            "RegisterDatasetRequest": {
+                "type": "object",
+                "properties": {
+                    "dataset_id": {
+                        "type": "string"
+                    },
+                    "dataset_schema": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ParamType"
+                        }
+                    },
+                    "url": {
+                        "$ref": "#/components/schemas/URL"
+                    },
+                    "provider_dataset_id": {
+                        "type": "string"
+                    },
+                    "provider_id": {
+                        "type": "string"
+                    },
+                    "metadata": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "dataset_id",
+                    "dataset_schema",
+                    "url"
+                ]
+            },
             "RegisterModelRequest": {
                 "type": "object",
                 "properties": {
@@ -7623,6 +7824,18 @@
                     "job_id"
                 ]
             },
+            "RunEvalDeprecatedRequest": {
+                "type": "object",
+                "properties": {
+                    "task_config": {
+                        "$ref": "#/components/schemas/BenchmarkConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "task_config"
+                ]
+            },
             "RunShieldRequest": {
                 "type": "object",
                 "properties": {
@@ -8105,6 +8318,9 @@
         {
             "name": "BatchInference (Coming Soon)"
         },
+        {
+            "name": "Benchmarks"
+        },
         {
             "name": "DatasetIO"
         },
@@ -8114,9 +8330,6 @@
         {
             "name": "Eval"
         },
-        {
-            "name": "Benchmarks"
-        },
         {
             "name": "Inference",
             "description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
@@ -8168,10 +8381,10 @@
             "tags": [
                 "Agents",
                 "BatchInference (Coming Soon)",
+                "Benchmarks",
                 "DatasetIO",
                 "Datasets",
                 "Eval",
-                "Benchmarks",
                 "Inference",
                 "Inspect",
                 "Models",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 0f0a613a8..89e066917 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -25,8 +25,8 @@ paths:
         - Benchmarks
       description: ''
       parameters:
-        - name: benchmark_id
-          in: path
+        - name: task_id
+          in: query
           required: true
           schema:
             type: string
@@ -57,7 +57,7 @@ paths:
         content:
           application/json:
             schema:
-              $ref: '#/components/schemas/DeprecatedRegisterBenchmarkRequest'
+              $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest'
         required: true
       deprecated: true
   /v1/datasetio/rows:
@@ -372,7 +372,7 @@ paths:
             schema:
               $ref: '#/components/schemas/EmbeddingsRequest'
         required: true
-  /v1/eval/tasks/{benchmark_id}/evaluations:
+  /v1/eval/benchmarks/{benchmark_id}/evaluations:
     post:
       responses:
         '200':
@@ -396,6 +396,31 @@ paths:
             schema:
               $ref: '#/components/schemas/EvaluateRowsRequest'
         required: true
+  /v1/eval/tasks/{task_id}/evaluations:
+    post:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EvaluateResponse'
+      tags:
+        - Eval
+      description: ''
+      parameters:
+        - name: task_id
+          in: path
+          required: true
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/EvaluateRowsDeprecatedRequest'
+        required: true
+      deprecated: true
   /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}:
     get:
       responses:
@@ -457,6 +482,26 @@ paths:
           required: true
           schema:
             type: string
+  /v1/eval/benchmarks/{benchmark_id}:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: '#/components/schemas/Benchmark'
+                  - type: 'null'
+      tags:
+        - Benchmarks
+      description: ''
+      parameters:
+        - name: benchmark_id
+          in: path
+          required: true
+          schema:
+            type: string
   /v1/datasets/{dataset_id}:
     get:
       responses:
@@ -490,26 +535,6 @@ paths:
           required: true
           schema:
             type: string
-  /v1/eval/tasks/{benchmark_id}:
-    get:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                oneOf:
-                  - $ref: '#/components/schemas/Benchmark'
-                  - type: 'null'
-      tags:
-        - Benchmarks
-      description: ''
-      parameters:
-        - name: benchmark_id
-          in: path
-          required: true
-          schema:
-            type: string
   /v1/models/{model_id}:
     get:
       responses:
@@ -852,7 +877,7 @@ paths:
             schema:
               $ref: '#/components/schemas/InvokeToolRequest'
         required: true
-  /v1/eval/tasks/{benchmark_id}/jobs/{job_id}:
+  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
     get:
       responses:
         '200':
@@ -895,7 +920,52 @@ paths:
           required: true
           schema:
             type: string
-  /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result:
+  /v1/eval/tasks/{task_id}/jobs/{job_id}:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: '#/components/schemas/JobStatus'
+                  - type: 'null'
+      tags:
+        - Eval
+      description: ''
+      parameters:
+        - name: task_id
+          in: path
+          required: true
+          schema:
+            type: string
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+      deprecated: true
+    delete:
+      responses:
+        '200':
+          description: OK
+      tags:
+        - Eval
+      description: ''
+      parameters:
+        - name: task_id
+          in: path
+          required: true
+          schema:
+            type: string
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+      deprecated: true
+  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
     get:
       responses:
         '200':
@@ -908,16 +978,67 @@ paths:
         - Eval
       description: ''
       parameters:
-        - name: job_id
-          in: path
-          required: true
-          schema:
-            type: string
         - name: benchmark_id
           in: path
           required: true
           schema:
             type: string
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+  /v1/eval/tasks/{task_id}/jobs/{job_id}/result:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EvaluateResponse'
+      tags:
+        - Eval
+      description: ''
+      parameters:
+        - name: task_id
+          in: path
+          required: true
+          schema:
+            type: string
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+      deprecated: true
+  /v1/eval/benchmarks:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListBenchmarksResponse'
+      tags:
+        - Benchmarks
+      description: ''
+      parameters: []
+    post:
+      responses:
+        '200':
+          description: OK
+      tags:
+        - Benchmarks
+      description: ''
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RegisterBenchmarkRequest'
+        required: true
   /v1/datasets:
     get:
       responses:
@@ -945,33 +1066,6 @@ paths:
             schema:
               $ref: '#/components/schemas/RegisterDatasetRequest'
         required: true
-  /v1/eval/tasks:
-    get:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListBenchmarksResponse'
-      tags:
-        - Benchmarks
-      description: ''
-      parameters: []
-    post:
-      responses:
-        '200':
-          description: OK
-      tags:
-        - Benchmarks
-      description: ''
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterBenchmarkRequest'
-        required: true
   /v1/models:
     get:
       responses:
@@ -1328,7 +1422,7 @@ paths:
             type: array
             items:
               type: string
-  /v1/eval/tasks/{benchmark_id}/jobs:
+  /v1/eval/benchmarks/{benchmark_id}/jobs:
     post:
       responses:
         '200':
@@ -1352,6 +1446,31 @@ paths:
             schema:
               $ref: '#/components/schemas/RunEvalRequest'
         required: true
+  /v1/eval/tasks/{task_id}/jobs:
+    post:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Job'
+      tags:
+        - Eval
+      description: ''
+      parameters:
+        - name: task_id
+          in: path
+          required: true
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RunEvalDeprecatedRequest'
+        required: true
+      deprecated: true
   /v1/safety/run-shield:
     post:
       responses:
@@ -1527,10 +1646,10 @@ components:
       additionalProperties: false
       required:
         - data
-    DeprecatedRegisterBenchmarkRequest:
+    DeprecatedRegisterEvalTaskRequest:
       type: object
       properties:
-        benchmark_id:
+        task_id:
           type: string
         dataset_id:
           type: string
@@ -1554,7 +1673,7 @@ components:
               - type: object
       additionalProperties: false
       required:
-        - benchmark_id
+        - task_id
         - dataset_id
         - scoring_functions
     AppendRowsRequest:
@@ -3063,26 +3182,6 @@ components:
         - median
         - categorical_count
         - accuracy
-    AppBenchmarkConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: app
-          default: app
-        eval_candidate:
-          $ref: '#/components/schemas/EvalCandidate'
-        scoring_params:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringFnParams'
-        num_examples:
-          type: integer
-      additionalProperties: false
-      required:
-        - type
-        - eval_candidate
-        - scoring_params
     BasicScoringFnParams:
       type: object
       properties:
@@ -3097,21 +3196,21 @@ components:
       additionalProperties: false
       required:
         - type
-    BenchmarkBenchmarkConfig:
+    BenchmarkConfig:
       type: object
       properties:
-        type:
-          type: string
-          const: benchmark
-          default: benchmark
         eval_candidate:
           $ref: '#/components/schemas/EvalCandidate'
+        scoring_params:
+          type: object
+          additionalProperties:
+            $ref: '#/components/schemas/ScoringFnParams'
         num_examples:
           type: integer
       additionalProperties: false
       required:
-        - type
         - eval_candidate
+        - scoring_params
     EvalCandidate:
       oneOf:
         - $ref: '#/components/schemas/ModelCandidate'
@@ -3121,15 +3220,6 @@ components:
         mapping:
           model: '#/components/schemas/ModelCandidate'
           agent: '#/components/schemas/AgentCandidate'
-    BenchmarkConfig:
-      oneOf:
-        - $ref: '#/components/schemas/BenchmarkBenchmarkConfig'
-        - $ref: '#/components/schemas/AppBenchmarkConfig'
-      discriminator:
-        propertyName: type
-        mapping:
-          benchmark: '#/components/schemas/BenchmarkBenchmarkConfig'
-          app: '#/components/schemas/AppBenchmarkConfig'
     LLMAsJudgeScoringFnParams:
       type: object
       properties:
@@ -3278,6 +3368,32 @@ components:
       required:
         - score_rows
         - aggregated_results
+    EvaluateRowsDeprecatedRequest:
+      type: object
+      properties:
+        input_rows:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+        scoring_functions:
+          type: array
+          items:
+            type: string
+        task_config:
+          $ref: '#/components/schemas/BenchmarkConfig'
+      additionalProperties: false
+      required:
+        - input_rows
+        - scoring_functions
+        - task_config
     Session:
       type: object
       properties:
@@ -4645,36 +4761,6 @@ components:
       additionalProperties: false
       required:
         - data
-    RegisterDatasetRequest:
-      type: object
-      properties:
-        dataset_id:
-          type: string
-        dataset_schema:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ParamType'
-        url:
-          $ref: '#/components/schemas/URL'
-        provider_dataset_id:
-          type: string
-        provider_id:
-          type: string
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-      additionalProperties: false
-      required:
-        - dataset_id
-        - dataset_schema
-        - url
     RegisterBenchmarkRequest:
       type: object
       properties:
@@ -4705,6 +4791,36 @@ components:
         - benchmark_id
         - dataset_id
         - scoring_functions
+    RegisterDatasetRequest:
+      type: object
+      properties:
+        dataset_id:
+          type: string
+        dataset_schema:
+          type: object
+          additionalProperties:
+            $ref: '#/components/schemas/ParamType'
+        url:
+          $ref: '#/components/schemas/URL'
+        provider_dataset_id:
+          type: string
+        provider_id:
+          type: string
+        metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+      additionalProperties: false
+      required:
+        - dataset_id
+        - dataset_schema
+        - url
     RegisterModelRequest:
       type: object
       properties:
@@ -4827,6 +4943,14 @@ components:
       additionalProperties: false
       required:
         - job_id
+    RunEvalDeprecatedRequest:
+      type: object
+      properties:
+        task_config:
+          $ref: '#/components/schemas/BenchmarkConfig'
+      additionalProperties: false
+      required:
+        - task_config
     RunShieldRequest:
       type: object
       properties:
@@ -5125,10 +5249,10 @@ tags:
     x-displayName: >-
       Agents API for creating and interacting with agentic systems.
   - name: BatchInference (Coming Soon)
+  - name: Benchmarks
   - name: DatasetIO
   - name: Datasets
   - name: Eval
-  - name: Benchmarks
   - name: Inference
     description: >-
       This API provides the raw interface to the underlying models. Two kinds of models
@@ -5159,10 +5283,10 @@ x-tagGroups:
     tags:
       - Agents
       - BatchInference (Coming Soon)
+      - Benchmarks
       - DatasetIO
       - Datasets
       - Eval
-      - Benchmarks
       - Inference
       - Inspect
       - Models
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index 90b14131f..b805e4976 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -83,3 +83,28 @@ class Eval(Protocol):
 
     @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
     async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
+
+    @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")  # deprecated; legacy task_id route
+    async def run_eval_DEPRECATED(  # see /eval/benchmarks/{benchmark_id}/jobs for the non-deprecated route
+        self,
+        task_id: str,  # bound from the {task_id} path segment
+        task_config: BenchmarkConfig,  # sent in the JSON request body as `task_config`
+    ) -> Job: ...
+
+    @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")  # deprecated; legacy task_id route
+    async def evaluate_rows_DEPRECATED(  # see /eval/benchmarks/{benchmark_id}/evaluations for the current route
+        self,
+        task_id: str,  # bound from the {task_id} path segment
+        input_rows: List[Dict[str, Any]],  # this and the parameters below form the JSON request body
+        scoring_functions: List[str],
+        task_config: BenchmarkConfig,
+    ) -> EvaluateResponse: ...
+
+    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")  # deprecated; legacy task_id route
+    async def job_status_DEPRECATED(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
+
+    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")  # deprecated; legacy task_id route
+    async def job_cancel_DEPRECATED(self, task_id: str, job_id: str) -> None: ...
+
+    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")  # deprecated; legacy task_id route
+    async def job_result_DEPRECATED(self, task_id: str, job_id: str) -> EvaluateResponse: ...
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index f9f306767..9945ad367 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -9,7 +9,6 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
 from llama_stack.apis.common.content_types import InterleavedContent, URL
 from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult
 from llama_stack.apis.eval import (
-    AppBenchmarkConfig,
     BenchmarkConfig,
     Eval,
     EvaluateResponse,
@@ -348,7 +347,7 @@ class EvalRouter(Eval):
     async def run_eval(
         self,
         benchmark_id: str,
-        task_config: AppBenchmarkConfig,
+        task_config: BenchmarkConfig,
     ) -> Job:
         return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
             benchmark_id=benchmark_id,