update

2026-01-07 23:51:29 +00:00 · 2025-02-12 20:48:05 -08:00 · 2025-02-12 20:48:05 -08:00 · ec721b3867
commit ec721b3867
parent e07776fff6
4 changed files with 695 additions and 334 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -67,8 +67,8 @@
                "description": "",
                "parameters": [
                    {
-                        "name": "benchmark_id",
-                        "in": "path",
+                        "name": "task_id",
+                        "in": "query",
                        "required": true,
                        "schema": {
                            "type": "string"
@ -114,7 +114,7 @@
                    "content": {
                        "application/json": {
                            "schema": {
-                                "$ref": "#/components/schemas/DeprecatedRegisterBenchmarkRequest"
+                                "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest"
                            }
                        }
                    },
@ -613,7 +613,7 @@
                }
            }
        },
-        "/v1/eval/tasks/{benchmark_id}/evaluations": {
+        "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
            "post": {
                "responses": {
                    "200": {
@ -653,6 +653,47 @@
                }
            }
        },
+        "/v1/eval/tasks/{task_id}/evaluations": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluateResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "task_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/EvaluateRowsDeprecatedRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                },
+                "deprecated": true
+            }
+        },
        "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": {
            "get": {
                "responses": {
@ -753,6 +794,43 @@
                ]
            }
        },
+        "/v1/eval/benchmarks/{benchmark_id}": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "oneOf": [
+                                        {
+                                            "$ref": "#/components/schemas/Benchmark"
+                                        },
+                                        {
+                                            "type": "null"
+                                        }
+                                    ]
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Benchmarks"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "benchmark_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
+            }
+        },
        "/v1/datasets/{dataset_id}": {
            "get": {
                "responses": {
@ -811,43 +889,6 @@
                ]
            }
        },
-        "/v1/eval/tasks/{benchmark_id}": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "oneOf": [
-                                        {
-                                            "$ref": "#/components/schemas/Benchmark"
-                                        },
-                                        {
-                                            "type": "null"
-                                        }
-                                    ]
-                                }
-                            }
-                        }
-                    }
-                },
-                "tags": [
-                    "Benchmarks"
-                ],
-                "description": "",
-                "parameters": [
-                    {
-                        "name": "benchmark_id",
-                        "in": "path",
-                        "required": true,
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                ]
-            }
-        },
        "/v1/models/{model_id}": {
            "get": {
                "responses": {
@ -1431,7 +1472,7 @@
                }
            }
        },
-        "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": {
+        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
            "get": {
                "responses": {
                    "200": {
@ -1505,7 +1546,83 @@
                ]
            }
        },
-        "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": {
+        "/v1/eval/tasks/{task_id}/jobs/{job_id}": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "oneOf": [
+                                        {
+                                            "$ref": "#/components/schemas/JobStatus"
+                                        },
+                                        {
+                                            "type": "null"
+                                        }
+                                    ]
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "task_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    },
+                    {
+                        "name": "job_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "deprecated": true
+            },
+            "delete": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "task_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    },
+                    {
+                        "name": "job_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "deprecated": true
+            }
+        },
+        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
            "get": {
                "responses": {
                    "200": {
@ -1525,7 +1642,7 @@
                "description": "",
                "parameters": [
                    {
-                        "name": "job_id",
+                        "name": "benchmark_id",
                        "in": "path",
                        "required": true,
                        "schema": {
@ -1533,7 +1650,7 @@
                        }
                    },
                    {
-                        "name": "benchmark_id",
+                        "name": "job_id",
                        "in": "path",
                        "required": true,
                        "schema": {
@ -1543,6 +1660,88 @@
                ]
            }
        },
+        "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/EvaluateResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "task_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    },
+                    {
+                        "name": "job_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "deprecated": true
+            }
+        },
+        "/v1/eval/benchmarks": {
+            "get": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/ListBenchmarksResponse"
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Benchmarks"
+                ],
+                "description": "",
+                "parameters": []
+            },
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    }
+                },
+                "tags": [
+                    "Benchmarks"
+                ],
+                "description": "",
+                "parameters": [],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/RegisterBenchmarkRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
        "/v1/datasets": {
            "get": {
                "responses": {
@ -1586,49 +1785,6 @@
                }
            }
        },
-        "/v1/eval/tasks": {
-            "get": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/ListBenchmarksResponse"
-                                }
-                            }
-                        }
-                    }
-                },
-                "tags": [
-                    "Benchmarks"
-                ],
-                "description": "",
-                "parameters": []
-            },
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "OK"
-                    }
-                },
-                "tags": [
-                    "Benchmarks"
-                ],
-                "description": "",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/RegisterBenchmarkRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
        "/v1/models": {
            "get": {
                "responses": {
@ -2204,7 +2360,7 @@
                ]
            }
        },
-        "/v1/eval/tasks/{benchmark_id}/jobs": {
+        "/v1/eval/benchmarks/{benchmark_id}/jobs": {
            "post": {
                "responses": {
                    "200": {
@ -2244,6 +2400,47 @@
                }
            }
        },
+        "/v1/eval/tasks/{task_id}/jobs": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/Job"
+                                }
+                            }
+                        }
+                    }
+                },
+                "tags": [
+                    "Eval"
+                ],
+                "description": "",
+                "parameters": [
+                    {
+                        "name": "task_id",
+                        "in": "path",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/RunEvalDeprecatedRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                },
+                "deprecated": true
+            }
+        },
        "/v1/safety/run-shield": {
            "post": {
                "responses": {
@ -2526,10 +2723,10 @@
                    "data"
                ]
            },
-            "DeprecatedRegisterBenchmarkRequest": {
+            "DeprecatedRegisterEvalTaskRequest": {
                "type": "object",
                "properties": {
-                    "benchmark_id": {
+                    "task_id": {
                        "type": "string"
                    },
                    "dataset_id": {
@ -2575,7 +2772,7 @@
                },
                "additionalProperties": false,
                "required": [
-                    "benchmark_id",
+                    "task_id",
                    "dataset_id",
                    "scoring_functions"
                ]
@ -4745,34 +4942,6 @@
                    "accuracy"
                ]
            },
-            "AppBenchmarkConfig": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "app",
-                        "default": "app"
-                    },
-                    "eval_candidate": {
-                        "$ref": "#/components/schemas/EvalCandidate"
-                    },
-                    "scoring_params": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ScoringFnParams"
-                        }
-                    },
-                    "num_examples": {
-                        "type": "integer"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "eval_candidate",
-                    "scoring_params"
-                ]
-            },
            "BasicScoringFnParams": {
                "type": "object",
                "properties": {
@ -4793,25 +4962,26 @@
                    "type"
                ]
            },
-            "BenchmarkBenchmarkConfig": {
+            "BenchmarkConfig": {
                "type": "object",
                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "benchmark",
-                        "default": "benchmark"
-                    },
                    "eval_candidate": {
                        "$ref": "#/components/schemas/EvalCandidate"
                    },
+                    "scoring_params": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ScoringFnParams"
+                        }
+                    },
                    "num_examples": {
                        "type": "integer"
                    }
                },
                "additionalProperties": false,
                "required": [
-                    "type",
-                    "eval_candidate"
+                    "eval_candidate",
+                    "scoring_params"
                ]
            },
            "EvalCandidate": {
@ -4831,23 +5001,6 @@
                    }
                }
            },
-            "BenchmarkConfig": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/BenchmarkBenchmarkConfig"
-                    },
-                    {
-                        "$ref": "#/components/schemas/AppBenchmarkConfig"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "benchmark": "#/components/schemas/BenchmarkBenchmarkConfig",
-                        "app": "#/components/schemas/AppBenchmarkConfig"
-                    }
-                }
-            },
            "LLMAsJudgeScoringFnParams": {
                "type": "object",
                "properties": {
@ -5108,6 +5261,54 @@
                    "aggregated_results"
                ]
            },
+            "EvaluateRowsDeprecatedRequest": {
+                "type": "object",
+                "properties": {
+                    "input_rows": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
+                        }
+                    },
+                    "scoring_functions": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    },
+                    "task_config": {
+                        "$ref": "#/components/schemas/BenchmarkConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "input_rows",
+                    "scoring_functions",
+                    "task_config"
+                ]
+            },
            "Session": {
                "type": "object",
                "properties": {
@ -7304,60 +7505,6 @@
                    "data"
                ]
            },
-            "RegisterDatasetRequest": {
-                "type": "object",
-                "properties": {
-                    "dataset_id": {
-                        "type": "string"
-                    },
-                    "dataset_schema": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "$ref": "#/components/schemas/ParamType"
-                        }
-                    },
-                    "url": {
-                        "$ref": "#/components/schemas/URL"
-                    },
-                    "provider_dataset_id": {
-                        "type": "string"
-                    },
-                    "provider_id": {
-                        "type": "string"
-                    },
-                    "metadata": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "dataset_id",
-                    "dataset_schema",
-                    "url"
-                ]
-            },
            "RegisterBenchmarkRequest": {
                "type": "object",
                "properties": {
@ -7412,6 +7559,60 @@
                    "scoring_functions"
                ]
            },
+            "RegisterDatasetRequest": {
+                "type": "object",
+                "properties": {
+                    "dataset_id": {
+                        "type": "string"
+                    },
+                    "dataset_schema": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "$ref": "#/components/schemas/ParamType"
+                        }
+                    },
+                    "url": {
+                        "$ref": "#/components/schemas/URL"
+                    },
+                    "provider_dataset_id": {
+                        "type": "string"
+                    },
+                    "provider_id": {
+                        "type": "string"
+                    },
+                    "metadata": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "dataset_id",
+                    "dataset_schema",
+                    "url"
+                ]
+            },
            "RegisterModelRequest": {
                "type": "object",
                "properties": {
@ -7623,6 +7824,18 @@
                    "job_id"
                ]
            },
+            "RunEvalDeprecatedRequest": {
+                "type": "object",
+                "properties": {
+                    "task_config": {
+                        "$ref": "#/components/schemas/BenchmarkConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "task_config"
+                ]
+            },
            "RunShieldRequest": {
                "type": "object",
                "properties": {
@ -8105,6 +8318,9 @@
        {
            "name": "BatchInference (Coming Soon)"
        },
+        {
+            "name": "Benchmarks"
+        },
        {
            "name": "DatasetIO"
        },
@ -8114,9 +8330,6 @@
        {
            "name": "Eval"
        },
-        {
-            "name": "Benchmarks"
-        },
        {
            "name": "Inference",
            "description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
@ -8168,10 +8381,10 @@
            "tags": [
                "Agents",
                "BatchInference (Coming Soon)",
+                "Benchmarks",
                "DatasetIO",
                "Datasets",
                "Eval",
-                "Benchmarks",
                "Inference",
                "Inspect",
                "Models",
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -25,8 +25,8 @@ paths:
        - Benchmarks
      description: ''
      parameters:
-        - name: benchmark_id
-          in: path
+        - name: task_id
+          in: query
          required: true
          schema:
            type: string
@ -57,7 +57,7 @@ paths:
        content:
          application/json:
            schema:
-              $ref: '#/components/schemas/DeprecatedRegisterBenchmarkRequest'
+              $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest'
        required: true
      deprecated: true
  /v1/datasetio/rows:
@ -372,7 +372,7 @@ paths:
            schema:
              $ref: '#/components/schemas/EmbeddingsRequest'
        required: true
-  /v1/eval/tasks/{benchmark_id}/evaluations:
+  /v1/eval/benchmarks/{benchmark_id}/evaluations:
    post:
      responses:
        '200':
@ -396,6 +396,31 @@ paths:
            schema:
              $ref: '#/components/schemas/EvaluateRowsRequest'
        required: true
+  /v1/eval/tasks/{task_id}/evaluations:
+    post:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EvaluateResponse'
+      tags:
+        - Eval
+      description: ''
+      parameters:
+        - name: task_id
+          in: path
+          required: true
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/EvaluateRowsDeprecatedRequest'
+        required: true
+      deprecated: true
  /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}:
    get:
      responses:
@ -457,6 +482,26 @@ paths:
          required: true
          schema:
            type: string
+  /v1/eval/benchmarks/{benchmark_id}:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: '#/components/schemas/Benchmark'
+                  - type: 'null'
+      tags:
+        - Benchmarks
+      description: ''
+      parameters:
+        - name: benchmark_id
+          in: path
+          required: true
+          schema:
+            type: string
  /v1/datasets/{dataset_id}:
    get:
      responses:
@ -490,26 +535,6 @@ paths:
          required: true
          schema:
            type: string
-  /v1/eval/tasks/{benchmark_id}:
-    get:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                oneOf:
-                  - $ref: '#/components/schemas/Benchmark'
-                  - type: 'null'
-      tags:
-        - Benchmarks
-      description: ''
-      parameters:
-        - name: benchmark_id
-          in: path
-          required: true
-          schema:
-            type: string
  /v1/models/{model_id}:
    get:
      responses:
@ -852,7 +877,7 @@ paths:
            schema:
              $ref: '#/components/schemas/InvokeToolRequest'
        required: true
-  /v1/eval/tasks/{benchmark_id}/jobs/{job_id}:
+  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
    get:
      responses:
        '200':
@ -895,7 +920,52 @@ paths:
          required: true
          schema:
            type: string
-  /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result:
+  /v1/eval/tasks/{task_id}/jobs/{job_id}:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: '#/components/schemas/JobStatus'
+                  - type: 'null'
+      tags:
+        - Eval
+      description: ''
+      parameters:
+        - name: task_id
+          in: path
+          required: true
+          schema:
+            type: string
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+      deprecated: true
+    delete:
+      responses:
+        '200':
+          description: OK
+      tags:
+        - Eval
+      description: ''
+      parameters:
+        - name: task_id
+          in: path
+          required: true
+          schema:
+            type: string
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+      deprecated: true
+  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
    get:
      responses:
        '200':
@ -908,16 +978,67 @@ paths:
        - Eval
      description: ''
      parameters:
-        - name: job_id
-          in: path
-          required: true
-          schema:
-            type: string
        - name: benchmark_id
          in: path
          required: true
          schema:
            type: string
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+  /v1/eval/tasks/{task_id}/jobs/{job_id}/result:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/EvaluateResponse'
+      tags:
+        - Eval
+      description: ''
+      parameters:
+        - name: task_id
+          in: path
+          required: true
+          schema:
+            type: string
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+      deprecated: true
+  /v1/eval/benchmarks:
+    get:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListBenchmarksResponse'
+      tags:
+        - Benchmarks
+      description: ''
+      parameters: []
+    post:
+      responses:
+        '200':
+          description: OK
+      tags:
+        - Benchmarks
+      description: ''
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RegisterBenchmarkRequest'
+        required: true
  /v1/datasets:
    get:
      responses:
@ -945,33 +1066,6 @@ paths:
            schema:
              $ref: '#/components/schemas/RegisterDatasetRequest'
        required: true
-  /v1/eval/tasks:
-    get:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListBenchmarksResponse'
-      tags:
-        - Benchmarks
-      description: ''
-      parameters: []
-    post:
-      responses:
-        '200':
-          description: OK
-      tags:
-        - Benchmarks
-      description: ''
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterBenchmarkRequest'
-        required: true
  /v1/models:
    get:
      responses:
@ -1328,7 +1422,7 @@ paths:
            type: array
            items:
              type: string
-  /v1/eval/tasks/{benchmark_id}/jobs:
+  /v1/eval/benchmarks/{benchmark_id}/jobs:
    post:
      responses:
        '200':
@ -1352,6 +1446,31 @@ paths:
            schema:
              $ref: '#/components/schemas/RunEvalRequest'
        required: true
+  /v1/eval/tasks/{task_id}/jobs:
+    post:
+      responses:
+        '200':
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Job'
+      tags:
+        - Eval
+      description: ''
+      parameters:
+        - name: task_id
+          in: path
+          required: true
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RunEvalDeprecatedRequest'
+        required: true
+      deprecated: true
  /v1/safety/run-shield:
    post:
      responses:
@ -1527,10 +1646,10 @@ components:
      additionalProperties: false
      required:
        - data
-    DeprecatedRegisterBenchmarkRequest:
+    DeprecatedRegisterEvalTaskRequest:
      type: object
      properties:
-        benchmark_id:
+        task_id:
          type: string
        dataset_id:
          type: string
@ -1554,7 +1673,7 @@ components:
              - type: object
      additionalProperties: false
      required:
-        - benchmark_id
+        - task_id
        - dataset_id
        - scoring_functions
    AppendRowsRequest:
@ -3063,26 +3182,6 @@ components:
        - median
        - categorical_count
        - accuracy
-    AppBenchmarkConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: app
-          default: app
-        eval_candidate:
-          $ref: '#/components/schemas/EvalCandidate'
-        scoring_params:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringFnParams'
-        num_examples:
-          type: integer
-      additionalProperties: false
-      required:
-        - type
-        - eval_candidate
-        - scoring_params
    BasicScoringFnParams:
      type: object
      properties:
@ -3097,21 +3196,21 @@ components:
      additionalProperties: false
      required:
        - type
-    BenchmarkBenchmarkConfig:
+    BenchmarkConfig:
      type: object
      properties:
-        type:
-          type: string
-          const: benchmark
-          default: benchmark
        eval_candidate:
          $ref: '#/components/schemas/EvalCandidate'
+        scoring_params:
+          type: object
+          additionalProperties:
+            $ref: '#/components/schemas/ScoringFnParams'
        num_examples:
          type: integer
      additionalProperties: false
      required:
-        - type
        - eval_candidate
+        - scoring_params
    EvalCandidate:
      oneOf:
        - $ref: '#/components/schemas/ModelCandidate'
@ -3121,15 +3220,6 @@ components:
        mapping:
          model: '#/components/schemas/ModelCandidate'
          agent: '#/components/schemas/AgentCandidate'
-    BenchmarkConfig:
-      oneOf:
-        - $ref: '#/components/schemas/BenchmarkBenchmarkConfig'
-        - $ref: '#/components/schemas/AppBenchmarkConfig'
-      discriminator:
-        propertyName: type
-        mapping:
-          benchmark: '#/components/schemas/BenchmarkBenchmarkConfig'
-          app: '#/components/schemas/AppBenchmarkConfig'
    LLMAsJudgeScoringFnParams:
      type: object
      properties:
@ -3278,6 +3368,32 @@ components:
      required:
        - score_rows
        - aggregated_results
+    EvaluateRowsDeprecatedRequest:
+      type: object
+      properties:
+        input_rows:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+        scoring_functions:
+          type: array
+          items:
+            type: string
+        task_config:
+          $ref: '#/components/schemas/BenchmarkConfig'
+      additionalProperties: false
+      required:
+        - input_rows
+        - scoring_functions
+        - task_config
    Session:
      type: object
      properties:
@ -4645,36 +4761,6 @@ components:
      additionalProperties: false
      required:
        - data
-    RegisterDatasetRequest:
-      type: object
-      properties:
-        dataset_id:
-          type: string
-        dataset_schema:
-          type: object
-          additionalProperties:
-            $ref: '#/components/schemas/ParamType'
-        url:
-          $ref: '#/components/schemas/URL'
-        provider_dataset_id:
-          type: string
-        provider_id:
-          type: string
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-      additionalProperties: false
-      required:
-        - dataset_id
-        - dataset_schema
-        - url
    RegisterBenchmarkRequest:
      type: object
      properties:
@ -4705,6 +4791,36 @@ components:
        - benchmark_id
        - dataset_id
        - scoring_functions
+    RegisterDatasetRequest:
+      type: object
+      properties:
+        dataset_id:
+          type: string
+        dataset_schema:
+          type: object
+          additionalProperties:
+            $ref: '#/components/schemas/ParamType'
+        url:
+          $ref: '#/components/schemas/URL'
+        provider_dataset_id:
+          type: string
+        provider_id:
+          type: string
+        metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+      additionalProperties: false
+      required:
+        - dataset_id
+        - dataset_schema
+        - url
    RegisterModelRequest:
      type: object
      properties:
@ -4827,6 +4943,14 @@ components:
      additionalProperties: false
      required:
        - job_id
+    RunEvalDeprecatedRequest:
+      type: object
+      properties:
+        task_config:
+          $ref: '#/components/schemas/BenchmarkConfig'
+      additionalProperties: false
+      required:
+        - task_config
    RunShieldRequest:
      type: object
      properties:
@ -5125,10 +5249,10 @@ tags:
    x-displayName: >-
      Agents API for creating and interacting with agentic systems.
  - name: BatchInference (Coming Soon)
+  - name: Benchmarks
  - name: DatasetIO
  - name: Datasets
  - name: Eval
-  - name: Benchmarks
  - name: Inference
    description: >-
      This API provides the raw interface to the underlying models. Two kinds of models
@ -5159,10 +5283,10 @@ x-tagGroups:
    tags:
      - Agents
      - BatchInference (Coming Soon)
+      - Benchmarks
      - DatasetIO
      - Datasets
      - Eval
-      - Benchmarks
      - Inference
      - Inspect
      - Models
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@ -83,3 +83,28 @@ class Eval(Protocol):

    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
+
+    @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
+    async def run_eval_DEPRECATED(
+        self,
+        task_id: str,
+        task_config: BenchmarkConfig,
+    ) -> Job: ...
+
+    @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
+    async def evaluate_rows_DEPRECATED(
+        self,
+        task_id: str,
+        input_rows: List[Dict[str, Any]],
+        scoring_functions: List[str],
+        task_config: BenchmarkConfig,
+    ) -> EvaluateResponse: ...
+
+    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
+    async def job_status_DEPRECATED(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
+
+    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
+    async def job_cancel_DEPRECATED(self, task_id: str, job_id: str) -> None: ...
+
+    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
+    async def job_result_DEPRECATED(self, task_id: str, job_id: str) -> EvaluateResponse: ...
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@ -9,7 +9,6 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
 from llama_stack.apis.common.content_types import InterleavedContent, URL
 from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult
 from llama_stack.apis.eval import (
-    AppBenchmarkConfig,
    BenchmarkConfig,
    Eval,
    EvaluateResponse,
@ -348,7 +347,7 @@ class EvalRouter(Eval):
    async def run_eval(
        self,
        benchmark_id: str,
-        task_config: AppBenchmarkConfig,
+        task_config: BenchmarkConfig,
    ) -> Job:
        return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
            benchmark_id=benchmark_id,