update

2025-08-12 13:00:39 +00:00 · 2025-02-12 20:48:05 -08:00 · 2025-02-12 20:48:05 -08:00 · ec721b3867
commit ec721b3867
parent e07776fff6
4 changed files with 695 additions and 334 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -67,8 +67,8 @@
                "description": "",
                "parameters": [
                    {
-                        "name": "benchmark_id",
+                        "name": "task_id",
-                        "in": "path",
+                        "in": "query",
                        "required": true,
                        "schema": {
                            "type": "string"
@ -114,7 +114,7 @@
                    "content": {
                        "application/json": {
                            "schema": {
-                                "$ref": "#/components/schemas/DeprecatedRegisterBenchmarkRequest"
+                                "$ref": "#/components/schemas/DeprecatedRegisterEvalTaskRequest"
                            }
                        }
                    },
@ -613,7 +613,7 @@
                }
            }
        },
-        "/v1/eval/tasks/{benchmark_id}/evaluations": {
+        "/v1/eval/benchmarks/{benchmark_id}/evaluations": {
            "post": {
                "responses": {
                    "200": {
@ -653,6 +653,47 @@
                }
            }
        },
        "/v1/eval/tasks/{task_id}/evaluations": {
            "post": {
                "responses": {
                    "200": {
                        "description": "OK",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/EvaluateResponse"
                                }
                            }
                        }
                    }
                },
                "tags": [
                    "Eval"
                ],
                "description": "",
                "parameters": [
                    {
                        "name": "task_id",
                        "in": "path",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/EvaluateRowsDeprecatedRequest"
                            }
                        }
                    },
                    "required": true
                },
                "deprecated": true
            }
        },
        "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": {
            "get": {
                "responses": {
@ -753,6 +794,43 @@
                ]
            }
        },
        "/v1/eval/benchmarks/{benchmark_id}": {
            "get": {
                "responses": {
                    "200": {
                        "description": "OK",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "oneOf": [
                                        {
                                            "$ref": "#/components/schemas/Benchmark"
                                        },
                                        {
                                            "type": "null"
                                        }
                                    ]
                                }
                            }
                        }
                    }
                },
                "tags": [
                    "Benchmarks"
                ],
                "description": "",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ]
            }
        },
        "/v1/datasets/{dataset_id}": {
            "get": {
                "responses": {
@ -811,43 +889,6 @@
                ]
            }
        },
        "/v1/eval/tasks/{benchmark_id}": {
            "get": {
                "responses": {
                    "200": {
                        "description": "OK",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "oneOf": [
                                        {
                                            "$ref": "#/components/schemas/Benchmark"
                                        },
                                        {
                                            "type": "null"
                                        }
                                    ]
                                }
                            }
                        }
                    }
                },
                "tags": [
                    "Benchmarks"
                ],
                "description": "",
                "parameters": [
                    {
                        "name": "benchmark_id",
                        "in": "path",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ]
            }
        },
        "/v1/models/{model_id}": {
            "get": {
                "responses": {
@ -1431,7 +1472,7 @@
                }
            }
        },
-        "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": {
+        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": {
            "get": {
                "responses": {
                    "200": {
@ -1505,7 +1546,83 @@
                ]
            }
        },
-        "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": {
+        "/v1/eval/tasks/{task_id}/jobs/{job_id}": {
            "get": {
                "responses": {
                    "200": {
                        "description": "OK",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "oneOf": [
                                        {
                                            "$ref": "#/components/schemas/JobStatus"
                                        },
                                        {
                                            "type": "null"
                                        }
                                    ]
                                }
                            }
                        }
                    }
                },
                "tags": [
                    "Eval"
                ],
                "description": "",
                "parameters": [
                    {
                        "name": "task_id",
                        "in": "path",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    },
                    {
                        "name": "job_id",
                        "in": "path",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ],
                "deprecated": true
            },
            "delete": {
                "responses": {
                    "200": {
                        "description": "OK"
                    }
                },
                "tags": [
                    "Eval"
                ],
                "description": "",
                "parameters": [
                    {
                        "name": "task_id",
                        "in": "path",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    },
                    {
                        "name": "job_id",
                        "in": "path",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ],
                "deprecated": true
            }
        },
        "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": {
            "get": {
                "responses": {
                    "200": {
@ -1525,7 +1642,7 @@
                "description": "",
                "parameters": [
                    {
-                        "name": "job_id",
+                        "name": "benchmark_id",
                        "in": "path",
                        "required": true,
                        "schema": {
@ -1533,7 +1650,7 @@
                        }
                    },
                    {
-                        "name": "benchmark_id",
+                        "name": "job_id",
                        "in": "path",
                        "required": true,
                        "schema": {
@ -1543,6 +1660,88 @@
                ]
            }
        },
        "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
            "get": {
                "responses": {
                    "200": {
                        "description": "OK",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/EvaluateResponse"
                                }
                            }
                        }
                    }
                },
                "tags": [
                    "Eval"
                ],
                "description": "",
                "parameters": [
                    {
                        "name": "task_id",
                        "in": "path",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    },
                    {
                        "name": "job_id",
                        "in": "path",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ],
                "deprecated": true
            }
        },
        "/v1/eval/benchmarks": {
            "get": {
                "responses": {
                    "200": {
                        "description": "OK",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/ListBenchmarksResponse"
                                }
                            }
                        }
                    }
                },
                "tags": [
                    "Benchmarks"
                ],
                "description": "",
                "parameters": []
            },
            "post": {
                "responses": {
                    "200": {
                        "description": "OK"
                    }
                },
                "tags": [
                    "Benchmarks"
                ],
                "description": "",
                "parameters": [],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/RegisterBenchmarkRequest"
                            }
                        }
                    },
                    "required": true
                }
            }
        },
        "/v1/datasets": {
            "get": {
                "responses": {
@ -1586,49 +1785,6 @@
                }
            }
        },
        "/v1/eval/tasks": {
            "get": {
                "responses": {
                    "200": {
                        "description": "OK",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/ListBenchmarksResponse"
                                }
                            }
                        }
                    }
                },
                "tags": [
                    "Benchmarks"
                ],
                "description": "",
                "parameters": []
            },
            "post": {
                "responses": {
                    "200": {
                        "description": "OK"
                    }
                },
                "tags": [
                    "Benchmarks"
                ],
                "description": "",
                "parameters": [],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/RegisterBenchmarkRequest"
                            }
                        }
                    },
                    "required": true
                }
            }
        },
        "/v1/models": {
            "get": {
                "responses": {
@ -2204,7 +2360,7 @@
                ]
            }
        },
-        "/v1/eval/tasks/{benchmark_id}/jobs": {
+        "/v1/eval/benchmarks/{benchmark_id}/jobs": {
            "post": {
                "responses": {
                    "200": {
@ -2244,6 +2400,47 @@
                }
            }
        },
        "/v1/eval/tasks/{task_id}/jobs": {
            "post": {
                "responses": {
                    "200": {
                        "description": "OK",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/Job"
                                }
                            }
                        }
                    }
                },
                "tags": [
                    "Eval"
                ],
                "description": "",
                "parameters": [
                    {
                        "name": "task_id",
                        "in": "path",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/RunEvalDeprecatedRequest"
                            }
                        }
                    },
                    "required": true
                },
                "deprecated": true
            }
        },
        "/v1/safety/run-shield": {
            "post": {
                "responses": {
@ -2526,10 +2723,10 @@
                    "data"
                ]
            },
-            "DeprecatedRegisterBenchmarkRequest": {
+            "DeprecatedRegisterEvalTaskRequest": {
                "type": "object",
                "properties": {
-                    "benchmark_id": {
+                    "task_id": {
                        "type": "string"
                    },
                    "dataset_id": {
@ -2575,7 +2772,7 @@
                },
                "additionalProperties": false,
                "required": [
-                    "benchmark_id",
+                    "task_id",
                    "dataset_id",
                    "scoring_functions"
                ]
@ -4745,34 +4942,6 @@
                    "accuracy"
                ]
            },
            "AppBenchmarkConfig": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "app",
                        "default": "app"
                    },
                    "eval_candidate": {
                        "$ref": "#/components/schemas/EvalCandidate"
                    },
                    "scoring_params": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringFnParams"
                        }
                    },
                    "num_examples": {
                        "type": "integer"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type",
                    "eval_candidate",
                    "scoring_params"
                ]
            },
            "BasicScoringFnParams": {
                "type": "object",
                "properties": {
@ -4793,25 +4962,26 @@
                    "type"
                ]
            },
-            "BenchmarkBenchmarkConfig": {
+            "BenchmarkConfig": {
                "type": "object",
                "properties": {
                    "type": {
                        "type": "string",
                        "const": "benchmark",
                        "default": "benchmark"
                    },
                    "eval_candidate": {
                        "$ref": "#/components/schemas/EvalCandidate"
                    },
                    "scoring_params": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ScoringFnParams"
                        }
                    },
                    "num_examples": {
                        "type": "integer"
                    }
                },
                "additionalProperties": false,
                "required": [
-                    "type",
+                    "eval_candidate",
-                    "eval_candidate"
+                    "scoring_params"
                ]
            },
            "EvalCandidate": {
@ -4831,23 +5001,6 @@
                    }
                }
            },
            "BenchmarkConfig": {
                "oneOf": [
                    {
                        "$ref": "#/components/schemas/BenchmarkBenchmarkConfig"
                    },
                    {
                        "$ref": "#/components/schemas/AppBenchmarkConfig"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
                        "benchmark": "#/components/schemas/BenchmarkBenchmarkConfig",
                        "app": "#/components/schemas/AppBenchmarkConfig"
                    }
                }
            },
            "LLMAsJudgeScoringFnParams": {
                "type": "object",
                "properties": {
@ -5108,6 +5261,54 @@
                    "aggregated_results"
                ]
            },
            "EvaluateRowsDeprecatedRequest": {
                "type": "object",
                "properties": {
                    "input_rows": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "additionalProperties": {
                                "oneOf": [
                                    {
                                        "type": "null"
                                    },
                                    {
                                        "type": "boolean"
                                    },
                                    {
                                        "type": "number"
                                    },
                                    {
                                        "type": "string"
                                    },
                                    {
                                        "type": "array"
                                    },
                                    {
                                        "type": "object"
                                    }
                                ]
                            }
                        }
                    },
                    "scoring_functions": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        }
                    },
                    "task_config": {
                        "$ref": "#/components/schemas/BenchmarkConfig"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "input_rows",
                    "scoring_functions",
                    "task_config"
                ]
            },
            "Session": {
                "type": "object",
                "properties": {
@ -7304,60 +7505,6 @@
                    "data"
                ]
            },
            "RegisterDatasetRequest": {
                "type": "object",
                "properties": {
                    "dataset_id": {
                        "type": "string"
                    },
                    "dataset_schema": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ParamType"
                        }
                    },
                    "url": {
                        "$ref": "#/components/schemas/URL"
                    },
                    "provider_dataset_id": {
                        "type": "string"
                    },
                    "provider_id": {
                        "type": "string"
                    },
                    "metadata": {
                        "type": "object",
                        "additionalProperties": {
                            "oneOf": [
                                {
                                    "type": "null"
                                },
                                {
                                    "type": "boolean"
                                },
                                {
                                    "type": "number"
                                },
                                {
                                    "type": "string"
                                },
                                {
                                    "type": "array"
                                },
                                {
                                    "type": "object"
                                }
                            ]
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "dataset_id",
                    "dataset_schema",
                    "url"
                ]
            },
            "RegisterBenchmarkRequest": {
                "type": "object",
                "properties": {
@ -7412,6 +7559,60 @@
                    "scoring_functions"
                ]
            },
            "RegisterDatasetRequest": {
                "type": "object",
                "properties": {
                    "dataset_id": {
                        "type": "string"
                    },
                    "dataset_schema": {
                        "type": "object",
                        "additionalProperties": {
                            "$ref": "#/components/schemas/ParamType"
                        }
                    },
                    "url": {
                        "$ref": "#/components/schemas/URL"
                    },
                    "provider_dataset_id": {
                        "type": "string"
                    },
                    "provider_id": {
                        "type": "string"
                    },
                    "metadata": {
                        "type": "object",
                        "additionalProperties": {
                            "oneOf": [
                                {
                                    "type": "null"
                                },
                                {
                                    "type": "boolean"
                                },
                                {
                                    "type": "number"
                                },
                                {
                                    "type": "string"
                                },
                                {
                                    "type": "array"
                                },
                                {
                                    "type": "object"
                                }
                            ]
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "dataset_id",
                    "dataset_schema",
                    "url"
                ]
            },
            "RegisterModelRequest": {
                "type": "object",
                "properties": {
@ -7623,6 +7824,18 @@
                    "job_id"
                ]
            },
            "RunEvalDeprecatedRequest": {
                "type": "object",
                "properties": {
                    "task_config": {
                        "$ref": "#/components/schemas/BenchmarkConfig"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "task_config"
                ]
            },
            "RunShieldRequest": {
                "type": "object",
                "properties": {
@ -8105,6 +8318,9 @@
        {
            "name": "BatchInference (Coming Soon)"
        },
        {
            "name": "Benchmarks"
        },
        {
            "name": "DatasetIO"
        },
@ -8114,9 +8330,6 @@
        {
            "name": "Eval"
        },
        {
            "name": "Benchmarks"
        },
        {
            "name": "Inference",
            "description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
@ -8168,10 +8381,10 @@
            "tags": [
                "Agents",
                "BatchInference (Coming Soon)",
                "Benchmarks",
                "DatasetIO",
                "Datasets",
                "Eval",
                "Benchmarks",
                "Inference",
                "Inspect",
                "Models",
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -25,8 +25,8 @@ paths:
        - Benchmarks
      description: ''
      parameters:
-        - name: benchmark_id
+        - name: task_id
-          in: path
+          in: query
          required: true
          schema:
            type: string
@ -57,7 +57,7 @@ paths:
        content:
          application/json:
            schema:
-              $ref: '#/components/schemas/DeprecatedRegisterBenchmarkRequest'
+              $ref: '#/components/schemas/DeprecatedRegisterEvalTaskRequest'
        required: true
      deprecated: true
  /v1/datasetio/rows:
@ -372,7 +372,7 @@ paths:
            schema:
              $ref: '#/components/schemas/EmbeddingsRequest'
        required: true
-  /v1/eval/tasks/{benchmark_id}/evaluations:
+  /v1/eval/benchmarks/{benchmark_id}/evaluations:
    post:
      responses:
        '200':
@ -396,6 +396,31 @@ paths:
            schema:
              $ref: '#/components/schemas/EvaluateRowsRequest'
        required: true
  /v1/eval/tasks/{task_id}/evaluations:
    post:
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EvaluateResponse'
      tags:
        - Eval
      description: ''
      parameters:
        - name: task_id
          in: path
          required: true
          schema:
            type: string
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/EvaluateRowsDeprecatedRequest'
        required: true
      deprecated: true
  /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}:
    get:
      responses:
@ -457,6 +482,26 @@ paths:
          required: true
          schema:
            type: string
  /v1/eval/benchmarks/{benchmark_id}:
    get:
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                oneOf:
                  - $ref: '#/components/schemas/Benchmark'
                  - type: 'null'
      tags:
        - Benchmarks
      description: ''
      parameters:
        - name: benchmark_id
          in: path
          required: true
          schema:
            type: string
  /v1/datasets/{dataset_id}:
    get:
      responses:
@ -490,26 +535,6 @@ paths:
          required: true
          schema:
            type: string
  /v1/eval/tasks/{benchmark_id}:
    get:
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                oneOf:
                  - $ref: '#/components/schemas/Benchmark'
                  - type: 'null'
      tags:
        - Benchmarks
      description: ''
      parameters:
        - name: benchmark_id
          in: path
          required: true
          schema:
            type: string
  /v1/models/{model_id}:
    get:
      responses:
@ -852,7 +877,7 @@ paths:
            schema:
              $ref: '#/components/schemas/InvokeToolRequest'
        required: true
-  /v1/eval/tasks/{benchmark_id}/jobs/{job_id}:
+  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
    get:
      responses:
        '200':
@ -895,7 +920,52 @@ paths:
          required: true
          schema:
            type: string
-  /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result:
+  /v1/eval/tasks/{task_id}/jobs/{job_id}:
    get:
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                oneOf:
                  - $ref: '#/components/schemas/JobStatus'
                  - type: 'null'
      tags:
        - Eval
      description: ''
      parameters:
        - name: task_id
          in: path
          required: true
          schema:
            type: string
        - name: job_id
          in: path
          required: true
          schema:
            type: string
      deprecated: true
    delete:
      responses:
        '200':
          description: OK
      tags:
        - Eval
      description: ''
      parameters:
        - name: task_id
          in: path
          required: true
          schema:
            type: string
        - name: job_id
          in: path
          required: true
          schema:
            type: string
      deprecated: true
  /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
    get:
      responses:
        '200':
@ -908,16 +978,67 @@ paths:
        - Eval
      description: ''
      parameters:
        - name: job_id
          in: path
          required: true
          schema:
            type: string
        - name: benchmark_id
          in: path
          required: true
          schema:
            type: string
        - name: job_id
          in: path
          required: true
          schema:
            type: string
  /v1/eval/tasks/{task_id}/jobs/{job_id}/result:
    get:
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EvaluateResponse'
      tags:
        - Eval
      description: ''
      parameters:
        - name: task_id
          in: path
          required: true
          schema:
            type: string
        - name: job_id
          in: path
          required: true
          schema:
            type: string
      deprecated: true
  /v1/eval/benchmarks:
    get:
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ListBenchmarksResponse'
      tags:
        - Benchmarks
      description: ''
      parameters: []
    post:
      responses:
        '200':
          description: OK
      tags:
        - Benchmarks
      description: ''
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RegisterBenchmarkRequest'
        required: true
  /v1/datasets:
    get:
      responses:
@ -945,33 +1066,6 @@ paths:
            schema:
              $ref: '#/components/schemas/RegisterDatasetRequest'
        required: true
  /v1/eval/tasks:
    get:
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ListBenchmarksResponse'
      tags:
        - Benchmarks
      description: ''
      parameters: []
    post:
      responses:
        '200':
          description: OK
      tags:
        - Benchmarks
      description: ''
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RegisterBenchmarkRequest'
        required: true
  /v1/models:
    get:
      responses:
@ -1328,7 +1422,7 @@ paths:
            type: array
            items:
              type: string
-  /v1/eval/tasks/{benchmark_id}/jobs:
+  /v1/eval/benchmarks/{benchmark_id}/jobs:
    post:
      responses:
        '200':
@ -1352,6 +1446,31 @@ paths:
            schema:
              $ref: '#/components/schemas/RunEvalRequest'
        required: true
  /v1/eval/tasks/{task_id}/jobs:
    post:
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Job'
      tags:
        - Eval
      description: ''
      parameters:
        - name: task_id
          in: path
          required: true
          schema:
            type: string
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/RunEvalDeprecatedRequest'
        required: true
      deprecated: true
  /v1/safety/run-shield:
    post:
      responses:
@ -1527,10 +1646,10 @@ components:
      additionalProperties: false
      required:
        - data
-    DeprecatedRegisterBenchmarkRequest:
+    DeprecatedRegisterEvalTaskRequest:
      type: object
      properties:
-        benchmark_id:
+        task_id:
          type: string
        dataset_id:
          type: string
@ -1554,7 +1673,7 @@ components:
              - type: object
      additionalProperties: false
      required:
-        - benchmark_id
+        - task_id
        - dataset_id
        - scoring_functions
    AppendRowsRequest:
@ -3063,26 +3182,6 @@ components:
        - median
        - categorical_count
        - accuracy
    AppBenchmarkConfig:
      type: object
      properties:
        type:
          type: string
          const: app
          default: app
        eval_candidate:
          $ref: '#/components/schemas/EvalCandidate'
        scoring_params:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringFnParams'
        num_examples:
          type: integer
      additionalProperties: false
      required:
        - type
        - eval_candidate
        - scoring_params
    BasicScoringFnParams:
      type: object
      properties:
@ -3097,21 +3196,21 @@ components:
      additionalProperties: false
      required:
        - type
-    BenchmarkBenchmarkConfig:
+    BenchmarkConfig:
      type: object
      properties:
        type:
          type: string
          const: benchmark
          default: benchmark
        eval_candidate:
          $ref: '#/components/schemas/EvalCandidate'
        scoring_params:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ScoringFnParams'
        num_examples:
          type: integer
      additionalProperties: false
      required:
        - type
        - eval_candidate
        - scoring_params
    EvalCandidate:
      oneOf:
        - $ref: '#/components/schemas/ModelCandidate'
@ -3121,15 +3220,6 @@ components:
        mapping:
          model: '#/components/schemas/ModelCandidate'
          agent: '#/components/schemas/AgentCandidate'
    BenchmarkConfig:
      oneOf:
        - $ref: '#/components/schemas/BenchmarkBenchmarkConfig'
        - $ref: '#/components/schemas/AppBenchmarkConfig'
      discriminator:
        propertyName: type
        mapping:
          benchmark: '#/components/schemas/BenchmarkBenchmarkConfig'
          app: '#/components/schemas/AppBenchmarkConfig'
    LLMAsJudgeScoringFnParams:
      type: object
      properties:
@ -3278,6 +3368,32 @@ components:
      required:
        - score_rows
        - aggregated_results
    EvaluateRowsDeprecatedRequest:
      type: object
      properties:
        input_rows:
          type: array
          items:
            type: object
            additionalProperties:
              oneOf:
                - type: 'null'
                - type: boolean
                - type: number
                - type: string
                - type: array
                - type: object
        scoring_functions:
          type: array
          items:
            type: string
        task_config:
          $ref: '#/components/schemas/BenchmarkConfig'
      additionalProperties: false
      required:
        - input_rows
        - scoring_functions
        - task_config
    Session:
      type: object
      properties:
@ -4645,36 +4761,6 @@ components:
      additionalProperties: false
      required:
        - data
    RegisterDatasetRequest:
      type: object
      properties:
        dataset_id:
          type: string
        dataset_schema:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ParamType'
        url:
          $ref: '#/components/schemas/URL'
        provider_dataset_id:
          type: string
        provider_id:
          type: string
        metadata:
          type: object
          additionalProperties:
            oneOf:
              - type: 'null'
              - type: boolean
              - type: number
              - type: string
              - type: array
              - type: object
      additionalProperties: false
      required:
        - dataset_id
        - dataset_schema
        - url
    RegisterBenchmarkRequest:
      type: object
      properties:
@ -4705,6 +4791,36 @@ components:
        - benchmark_id
        - dataset_id
        - scoring_functions
    RegisterDatasetRequest:
      type: object
      properties:
        dataset_id:
          type: string
        dataset_schema:
          type: object
          additionalProperties:
            $ref: '#/components/schemas/ParamType'
        url:
          $ref: '#/components/schemas/URL'
        provider_dataset_id:
          type: string
        provider_id:
          type: string
        metadata:
          type: object
          additionalProperties:
            oneOf:
              - type: 'null'
              - type: boolean
              - type: number
              - type: string
              - type: array
              - type: object
      additionalProperties: false
      required:
        - dataset_id
        - dataset_schema
        - url
    RegisterModelRequest:
      type: object
      properties:
@ -4827,6 +4943,14 @@ components:
      additionalProperties: false
      required:
        - job_id
    RunEvalDeprecatedRequest:
      type: object
      properties:
        task_config:
          $ref: '#/components/schemas/BenchmarkConfig'
      additionalProperties: false
      required:
        - task_config
    RunShieldRequest:
      type: object
      properties:
@ -5125,10 +5249,10 @@ tags:
    x-displayName: >-
      Agents API for creating and interacting with agentic systems.
  - name: BatchInference (Coming Soon)
  - name: Benchmarks
  - name: DatasetIO
  - name: Datasets
  - name: Eval
  - name: Benchmarks
  - name: Inference
    description: >-
      This API provides the raw interface to the underlying models. Two kinds of models
@ -5159,10 +5283,10 @@ x-tagGroups:
    tags:
      - Agents
      - BatchInference (Coming Soon)
      - Benchmarks
      - DatasetIO
      - Datasets
      - Eval
      - Benchmarks
      - Inference
      - Inspect
      - Models
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@ -83,3 +83,28 @@ class Eval(Protocol):
    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
    @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
    async def run_eval_DEPRECATED(
        self,
        task_id: str,
        task_config: BenchmarkConfig,
    ) -> Job: ...
    @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
    async def evaluate_rows_DEPRECATED(
        self,
        task_id: str,
        input_rows: List[Dict[str, Any]],
        scoring_functions: List[str],
        task_config: BenchmarkConfig,
    ) -> EvaluateResponse: ...
    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
    async def job_status_DEPRECATED(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
    async def job_cancel_DEPRECATED(self, task_id: str, job_id: str) -> None: ...
    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
    async def job_result_DEPRECATED(self, task_id: str, job_id: str) -> EvaluateResponse: ...
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@ -9,7 +9,6 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
 from llama_stack.apis.common.content_types import InterleavedContent, URL
 from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult
 from llama_stack.apis.eval import (
    AppBenchmarkConfig,
    BenchmarkConfig,
    Eval,
    EvaluateResponse,
@ -348,7 +347,7 @@ class EvalRouter(Eval):
    async def run_eval(
        self,
        benchmark_id: str,
-        task_config: AppBenchmarkConfig,
+        task_config: BenchmarkConfig,
    ) -> Job:
        return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
            benchmark_id=benchmark_id,