From 017d24fe6561005b2debdddb5935e4c475629862 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:34:32 -0800 Subject: [PATCH] replace task_id -> benchmark_id --- docs/_static/llama-stack-spec.html | 26 +++--- docs/_static/llama-stack-spec.yaml | 26 +++--- .../Llama_Stack_Benchmark_Evals.ipynb | 6 +- docs/source/building_applications/evals.md | 6 +- .../building_applications/evaluation.md | 4 +- .../references/evals_reference/index.md | 6 +- .../references/python_sdk_reference/index.md | 10 +-- llama_stack/apis/eval/eval.py | 20 ++--- llama_stack/apis/eval_tasks/__init__.py | 7 -- llama_stack/apis/eval_tasks/eval_tasks.py | 86 ------------------- llama_stack/cli/download.py | 14 +-- llama_stack/cli/verify_download.py | 4 +- llama_stack/distribution/routers/routers.py | 28 +++--- .../distribution/routers/routing_tables.py | 10 +-- .../ui/page/evaluations/native_eval.py | 2 +- .../inline/eval/meta_reference/eval.py | 16 ++-- 16 files changed, 89 insertions(+), 182 deletions(-) delete mode 100644 llama_stack/apis/eval_tasks/__init__.py delete mode 100644 llama_stack/apis/eval_tasks/eval_tasks.py diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 84c6fd99d..c656808a6 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -613,7 +613,7 @@ } } }, - "/v1/eval/tasks/{task_id}/evaluations": { + "/v1/eval/tasks/{benchmark_id}/evaluations": { "post": { "responses": { "200": { @@ -633,7 +633,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -811,7 +811,7 @@ ] } }, - "/v1/eval/tasks/{task_id}": { + "/v1/eval/tasks/{benchmark_id}": { "get": { "responses": { "200": { @@ -838,7 +838,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1431,7 +1431,7 @@ } } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": { "get": { "responses": { "200": { @@ -1458,7 +1458,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1487,7 +1487,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1505,7 +1505,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": { "get": { "responses": { "200": { @@ -1533,7 +1533,7 @@ } }, { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -2204,7 +2204,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs": { + "/v1/eval/tasks/{benchmark_id}/jobs": { "post": { "responses": { "200": { @@ -2224,7 +2224,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -7361,7 +7361,7 @@ "RegisterBenchmarkRequest": { "type": "object", "properties": { - "task_id": { + "benchmark_id": { "type": "string" }, "dataset_id": { @@ -7407,7 +7407,7 @@ }, "additionalProperties": false, "required": [ - "task_id", + "benchmark_id", "dataset_id", "scoring_functions" ] diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index dd0951fde..0f0a613a8 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -372,7 +372,7 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - 
/v1/eval/tasks/{task_id}/evaluations: + /v1/eval/tasks/{benchmark_id}/evaluations: post: responses: '200': @@ -385,7 +385,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -490,7 +490,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}: + /v1/eval/tasks/{benchmark_id}: get: responses: '200': @@ -505,7 +505,7 @@ paths: - Benchmarks description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -852,7 +852,7 @@ paths: schema: $ref: '#/components/schemas/InvokeToolRequest' required: true - /v1/eval/tasks/{task_id}/jobs/{job_id}: + /v1/eval/tasks/{benchmark_id}/jobs/{job_id}: get: responses: '200': @@ -867,7 +867,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -885,7 +885,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -895,7 +895,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result: get: responses: '200': @@ -913,7 +913,7 @@ paths: required: true schema: type: string - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -1328,7 +1328,7 @@ paths: type: array items: type: string - /v1/eval/tasks/{task_id}/jobs: + /v1/eval/tasks/{benchmark_id}/jobs: post: responses: '200': @@ -1341,7 +1341,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -4678,7 +4678,7 @@ components: RegisterBenchmarkRequest: type: object properties: - task_id: + benchmark_id: type: string dataset_id: type: string @@ -4702,7 +4702,7 @@ components: - type: object additionalProperties: false required: - - task_id + - benchmark_id - dataset_id - scoring_functions RegisterModelRequest: diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 6e8480f94..599df201a 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -1024,7 +1024,7 @@ ")\n", "\n", "response = client.eval.evaluate_rows(\n", - " task_id=\"meta-reference::mmmu\",\n", + " benchmark_id=\"meta-reference::mmmu\",\n", " input_rows=eval_rows,\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", " task_config={\n", @@ -1203,7 +1203,7 @@ ")\n", "\n", "response = client.eval.evaluate_rows(\n", - " task_id=\"meta-reference::simpleqa\",\n", + " benchmark_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " task_config={\n", @@ -1352,7 +1352,7 @@ "}\n", "\n", "response = client.eval.evaluate_rows(\n", - " task_id=\"meta-reference::simpleqa\",\n", + " benchmark_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " task_config={\n", diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md index c1c371ca8..f28e0d5fd 100644 --- a/docs/source/building_applications/evals.md +++ b/docs/source/building_applications/evals.md @@ -48,7 +48,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], 
task_config={ @@ -106,7 +106,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -156,7 +156,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ diff --git a/docs/source/building_applications/evaluation.md b/docs/source/building_applications/evaluation.md index df18c146c..ad220f751 100644 --- a/docs/source/building_applications/evaluation.md +++ b/docs/source/building_applications/evaluation.md @@ -18,7 +18,7 @@ response = client.benchmarks.register( # Run evaluation job = client.eval.run_eval( - task_id="my_eval", + benchmark_id="my_eval", task_config={ "type": "app", "eval_candidate": {"type": "agent", "config": agent_config}, @@ -26,5 +26,5 @@ job = client.eval.run_eval( ) # Get results -result = client.eval.job_result(task_id="my_eval", job_id=job.job_id) +result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id) ``` diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md index f0275511d..71dbb47e5 100644 --- a/docs/source/references/evals_reference/index.md +++ b/docs/source/references/evals_reference/index.md @@ -84,7 +84,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], task_config={ @@ -142,7 +142,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -192,7 +192,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ diff --git a/docs/source/references/python_sdk_reference/index.md b/docs/source/references/python_sdk_reference/index.md index eca8c58f5..9d1130422 100644 --- a/docs/source/references/python_sdk_reference/index.md +++ b/docs/source/references/python_sdk_reference/index.md @@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job Methods: -- client.eval.evaluate_rows(task_id, \*\*params) -> EvaluateResponse -- client.eval.run_eval(task_id, \*\*params) -> Job +- client.eval.evaluate_rows(benchmark_id, \*\*params) -> EvaluateResponse +- client.eval.run_eval(benchmark_id, \*\*params) -> Job ### Jobs @@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse Methods: -- client.eval.jobs.retrieve(job_id, \*, task_id) -> EvaluateResponse -- client.eval.jobs.cancel(job_id, \*, task_id) -> None -- client.eval.jobs.status(job_id, \*, task_id) -> Optional[JobStatusResponse] +- client.eval.jobs.retrieve(job_id, \*, benchmark_id) -> EvaluateResponse +- client.eval.jobs.cancel(job_id, \*, benchmark_id) -> None +- client.eval.jobs.status(job_id, \*, benchmark_id) -> Optional[JobStatusResponse] ## Inspect diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 16b96d618..273ef657c 100644 --- a/llama_stack/apis/eval/eval.py +++ 
b/llama_stack/apis/eval/eval.py @@ -76,27 +76,27 @@ class EvaluateResponse(BaseModel): class Eval(Protocol): - @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + @webmethod(route="/eval/tasks/{benchmark_id}/jobs", method="POST") async def run_eval( self, - task_id: str, + benchmark_id: str, task_config: BenchmarkConfig, ) -> Job: ... - @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") + @webmethod(route="/eval/tasks/{benchmark_id}/evaluations", method="POST") async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], task_config: BenchmarkConfig, ) -> EvaluateResponse: ... - @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... + @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="GET") + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ... - @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE") - async def job_cancel(self, task_id: str, job_id: str) -> None: ... + @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="DELETE") + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... - @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET") - async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ... + @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}/result", method="GET") + async def job_result(self, job_id: str, benchmark_id: str) -> EvaluateResponse: ... diff --git a/llama_stack/apis/eval_tasks/__init__.py b/llama_stack/apis/eval_tasks/__init__.py deleted file mode 100644 index f8f564957..000000000 --- a/llama_stack/apis/eval_tasks/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .benchmarks import * # noqa: F401 F403 diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py deleted file mode 100644 index 7c8ed8dc0..000000000 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable - -from llama_models.schema_utils import json_schema_type, webmethod -from pydantic import BaseModel, Field - -from llama_stack.apis.resource import Resource, ResourceType - - -class CommonBenchmarkFields(BaseModel): - dataset_id: str - scoring_functions: List[str] - metadata: Dict[str, Any] = Field( - default_factory=dict, - description="Metadata for this evaluation task", - ) - - -@json_schema_type -class Benchmark(CommonBenchmarkFields, Resource): - type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value - - @property - def task_id(self) -> str: - return self.identifier - - @property - def provider_benchmark_id(self) -> str: - return self.provider_resource_id - - -class BenchmarkInput(CommonBenchmarkFields, BaseModel): - task_id: str - provider_id: Optional[str] = None - provider_benchmark_id: Optional[str] = None - - -class ListBenchmarksResponse(BaseModel): - data: List[Benchmark] - - -@runtime_checkable -class Benchmarks(Protocol): - @webmethod(route="/eval/tasks", method="GET") - async def list_benchmarks(self) -> ListBenchmarksResponse: ... - - @webmethod(route="/eval/tasks/{task_id}", method="GET") - async def get_benchmark( - self, - task_id: str, - ) -> Optional[Benchmark]: ... - - @webmethod(route="/eval/tasks", method="POST") - async def register_benchmark( - self, - task_id: str, - dataset_id: str, - scoring_functions: List[str], - provider_benchmark_id: Optional[str] = None, - provider_id: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - ) -> None: ... - - @webmethod(route="/eval-tasks", method="GET") - async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse: ... - - @webmethod(route="/eval-tasks/{benchmark_id}", method="GET") - async def DEPRECATED_get_benchmark( - self, - benchmark_id: str, - ) -> Optional[Benchmark]: ... - - @webmethod(route="/eval-tasks", method="POST") - async def DEPRECATED_register_benchmark( - self, - benchmark_id: str, - dataset_id: str, - scoring_functions: List[str], - provider_benchmark_id: Optional[str] = None, - provider_id: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - ) -> None: ... 
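Reviewer note: the deleted eval_tasks module above was the last task_id-keyed surface; benchmarks are now registered and looked up by benchmark_id. Below is a minimal sketch of the renamed registration call, assuming the Python client mirrors the server-side parameter names introduced in this patch; the endpoint and dataset identifier are illustrative placeholders, not values from the patch.

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")  # illustrative endpoint, not from this patch

client.benchmarks.register(
    benchmark_id="meta-reference::mmmu",  # formerly task_id
    dataset_id="mmmu",                    # illustrative dataset identifier
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
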
diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py index 379ac49ca..7b9b303f4 100644 --- a/llama_stack/cli/download.py +++ b/llama_stack/cli/download.py @@ -105,7 +105,7 @@ class DownloadTask: output_file: str total_size: int = 0 downloaded_size: int = 0 - task_id: Optional[int] = None + benchmark_id: Optional[int] = None retries: int = 0 max_retries: int = 3 @@ -183,8 +183,8 @@ class ParallelDownloader: ) # Update the progress bar's total size once we know it - if task.task_id is not None: - self.progress.update(task.task_id, total=task.total_size) + if task.benchmark_id is not None: + self.progress.update(task.benchmark_id, total=task.total_size) except httpx.HTTPError as e: self.console.print(f"[red]Error getting file info: {str(e)}[/red]") @@ -207,7 +207,7 @@ class ParallelDownloader: file.write(chunk) task.downloaded_size += len(chunk) self.progress.update( - task.task_id, + task.benchmark_id, completed=task.downloaded_size, ) @@ -234,7 +234,7 @@ class ParallelDownloader: if os.path.exists(task.output_file): if self.verify_file_integrity(task): self.console.print(f"[green]Already downloaded {task.output_file}[/green]") - self.progress.update(task.task_id, completed=task.total_size) + self.progress.update(task.benchmark_id, completed=task.total_size) return await self.prepare_download(task) @@ -258,7 +258,7 @@ class ParallelDownloader: raise DownloadError(f"Download failed: {str(e)}") from e except Exception as e: - self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]") + self.progress.update(task.benchmark_id, description=f"[red]Failed: {task.output_file}[/red]") raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e def has_disk_space(self, tasks: List[DownloadTask]) -> bool: @@ -293,7 +293,7 @@ class ParallelDownloader: with self.progress: for task in tasks: desc = f"Downloading {Path(task.output_file).name}" - task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) + task.benchmark_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) semaphore = asyncio.Semaphore(self.max_concurrent_downloads) diff --git a/llama_stack/cli/verify_download.py b/llama_stack/cli/verify_download.py index 47993c361..ca72ca581 100644 --- a/llama_stack/cli/verify_download.py +++ b/llama_stack/cli/verify_download.py @@ -82,7 +82,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) - ) as progress: for filepath, expected_hash in checksums.items(): full_path = model_dir / filepath - task_id = progress.add_task(f"Verifying {filepath}...", total=None) + benchmark_id = progress.add_task(f"Verifying {filepath}...", total=None) exists = full_path.exists() actual_hash = None @@ -102,7 +102,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) - ) ) - progress.remove_task(task_id) + progress.remove_task(benchmark_id) return results diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 6697b03e2..f9f306767 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -347,23 +347,23 @@ class EvalRouter(Eval): async def run_eval( self, - task_id: str, + benchmark_id: str, task_config: AppBenchmarkConfig, ) -> Job: - return await self.routing_table.get_provider_impl(task_id).run_eval( - task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).run_eval( + 
benchmark_id=benchmark_id, task_config=task_config, ) async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], task_config: BenchmarkConfig, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).evaluate_rows( - task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows( + benchmark_id=benchmark_id, input_rows=input_rows, scoring_functions=scoring_functions, task_config=task_config, @@ -371,28 +371,28 @@ class EvalRouter(Eval): async def job_status( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> Optional[JobStatus]: - return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id) + return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id) async def job_cancel( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> None: - await self.routing_table.get_provider_impl(task_id).job_cancel( - task_id, + await self.routing_table.get_provider_impl(benchmark_id).job_cancel( + benchmark_id, job_id, ) async def job_result( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).job_result( - task_id, + return await self.routing_table.get_provider_impl(benchmark_id).job_result( + benchmark_id, job_id, ) diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 6c1b06ed6..a52ab7fbd 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -433,12 +433,12 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): async def list_benchmarks(self) -> ListBenchmarksResponse: return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark")) - async def get_benchmark(self, task_id: str) -> Optional[Benchmark]: - return await self.get_object_by_identifier("benchmark", task_id) + async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]: + return await self.get_object_by_identifier("benchmark", benchmark_id) async def register_benchmark( self, - task_id: str, + benchmark_id: str, dataset_id: str, scoring_functions: List[str], metadata: Optional[Dict[str, Any]] = None, @@ -455,9 +455,9 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): "No provider specified and multiple providers available. Please specify a provider_id." 
) if provider_benchmark_id is None: - provider_benchmark_id = task_id + provider_benchmark_id = benchmark_id benchmark = Benchmark( - identifier=task_id, + identifier=benchmark_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py index 39385dd14..753c574a2 100644 --- a/llama_stack/distribution/ui/page/evaluations/native_eval.py +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -211,7 +211,7 @@ def run_evaluation_3(): progress_bar.progress(progress, text=progress_text) # Run evaluation for current row eval_res = llama_stack_api.client.eval.evaluate_rows( - task_id=selected_benchmark, + benchmark_id=selected_benchmark, input_rows=[r], scoring_functions=benchmarks[selected_benchmark].scoring_functions, task_config=benchmark_config, diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 07310f59c..a02418e74 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -83,10 +83,10 @@ class MetaReferenceEvalImpl( async def run_eval( self, - task_id: str, + benchmark_id: str, task_config: BenchmarkConfig, ) -> Job: - task_def = self.benchmarks[task_id] + task_def = self.benchmarks[benchmark_id] dataset_id = task_def.dataset_id candidate = task_config.eval_candidate scoring_functions = task_def.scoring_functions @@ -97,7 +97,7 @@ class MetaReferenceEvalImpl( rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples), ) res = await self.evaluate_rows( - task_id=task_id, + benchmark_id=benchmark_id, input_rows=all_rows.rows, scoring_functions=scoring_functions, task_config=task_config, @@ -189,7 +189,7 @@ class MetaReferenceEvalImpl( async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], task_config: BenchmarkConfig, @@ -219,17 +219,17 @@ class MetaReferenceEvalImpl( return EvaluateResponse(generations=generations, scores=score_response.results) - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: if job_id in self.jobs: return JobStatus.completed return None - async def job_cancel(self, task_id: str, job_id: str) -> None: + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: raise NotImplementedError("Job cancel is not implemented yet") - async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse: - status = await self.job_status(task_id, job_id) + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: + status = await self.job_status(benchmark_id, job_id) if not status or status != JobStatus.completed: raise ValueError(f"Job is not completed, Status: {status.value}")
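
Reviewer note: end to end, the renamed Eval surface reads as sketched below. This assumes the Python client exposes the same parameter names as the server routes changed in this patch; the benchmark id, endpoint, and agent configuration are placeholder values.

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")  # illustrative endpoint, not from this patch

agent_config = {"model": "meta-llama/Llama-3.1-8B-Instruct", "instructions": "Answer the question."}  # placeholder candidate config

job = client.eval.run_eval(
    benchmark_id="my_eval",  # formerly task_id
    task_config={
        "type": "app",
        "eval_candidate": {"type": "agent", "config": agent_config},
    },
)

# Job status and results are now keyed on benchmark_id as well.
status = client.eval.jobs.status(job.job_id, benchmark_id="my_eval")
result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
print(result.scores)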