Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-10 04:08:31 +00:00)

Commit 017d24fe65 (parent b20742fce7): replace task_id -> benchmark_id

16 changed files with 89 additions and 182 deletions
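The rename only touches the keyword argument on the client side and the path parameter on the server side. A minimal before/after sketch, assuming a reachable Llama Stack server and a benchmark already registered as "my_eval"; the call shapes mirror the documentation snippets updated below, while the base URL and the empty agent_config are placeholders:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")  # placeholder server address
agent_config: dict = {}  # placeholder; see the doc hunks below for a full agent config

# Before this commit the eval endpoints took `task_id`:
#   job = client.eval.run_eval(task_id="my_eval", task_config=...)
#   result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)

# After this commit the same calls take `benchmark_id`:
job = client.eval.run_eval(
    benchmark_id="my_eval",
    task_config={
        "type": "app",
        "eval_candidate": {"type": "agent", "config": agent_config},
    },
)
result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
```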
docs/_static/llama-stack-spec.html (vendored, 26 lines changed)

@@ -613,7 +613,7 @@
 }
 }
 },
-"/v1/eval/tasks/{task_id}/evaluations": {
+"/v1/eval/tasks/{benchmark_id}/evaluations": {
 "post": {
 "responses": {
 "200": {
@@ -633,7 +633,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -811,7 +811,7 @@
 ]
 }
 },
-"/v1/eval/tasks/{task_id}": {
+"/v1/eval/tasks/{benchmark_id}": {
 "get": {
 "responses": {
 "200": {
@@ -838,7 +838,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -1431,7 +1431,7 @@
 }
 }
 },
-"/v1/eval/tasks/{task_id}/jobs/{job_id}": {
+"/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": {
 "get": {
 "responses": {
 "200": {
@@ -1458,7 +1458,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -1487,7 +1487,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -1505,7 +1505,7 @@
 ]
 }
 },
-"/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
+"/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": {
 "get": {
 "responses": {
 "200": {
@@ -1533,7 +1533,7 @@
 }
 },
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -2204,7 +2204,7 @@
 ]
 }
 },
-"/v1/eval/tasks/{task_id}/jobs": {
+"/v1/eval/tasks/{benchmark_id}/jobs": {
 "post": {
 "responses": {
 "200": {
@@ -2224,7 +2224,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -7361,7 +7361,7 @@
 "RegisterBenchmarkRequest": {
 "type": "object",
 "properties": {
-"task_id": {
+"benchmark_id": {
 "type": "string"
 },
 "dataset_id": {
@@ -7407,7 +7407,7 @@
 },
 "additionalProperties": false,
 "required": [
-"task_id",
+"benchmark_id",
 "dataset_id",
 "scoring_functions"
 ]

docs/_static/llama-stack-spec.yaml (vendored, 26 lines changed)

@@ -372,7 +372,7 @@ paths:
 schema:
 $ref: '#/components/schemas/EmbeddingsRequest'
 required: true
-/v1/eval/tasks/{task_id}/evaluations:
+/v1/eval/tasks/{benchmark_id}/evaluations:
 post:
 responses:
 '200':
@@ -385,7 +385,7 @@ paths:
 - Eval
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -490,7 +490,7 @@ paths:
 required: true
 schema:
 type: string
-/v1/eval/tasks/{task_id}:
+/v1/eval/tasks/{benchmark_id}:
 get:
 responses:
 '200':
@@ -505,7 +505,7 @@ paths:
 - Benchmarks
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -852,7 +852,7 @@ paths:
 schema:
 $ref: '#/components/schemas/InvokeToolRequest'
 required: true
-/v1/eval/tasks/{task_id}/jobs/{job_id}:
+/v1/eval/tasks/{benchmark_id}/jobs/{job_id}:
 get:
 responses:
 '200':
@@ -867,7 +867,7 @@ paths:
 - Eval
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -885,7 +885,7 @@ paths:
 - Eval
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -895,7 +895,7 @@ paths:
 required: true
 schema:
 type: string
-/v1/eval/tasks/{task_id}/jobs/{job_id}/result:
+/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result:
 get:
 responses:
 '200':
@@ -913,7 +913,7 @@ paths:
 required: true
 schema:
 type: string
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -1328,7 +1328,7 @@ paths:
 type: array
 items:
 type: string
-/v1/eval/tasks/{task_id}/jobs:
+/v1/eval/tasks/{benchmark_id}/jobs:
 post:
 responses:
 '200':
@@ -1341,7 +1341,7 @@ paths:
 - Eval
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -4678,7 +4678,7 @@ components:
 RegisterBenchmarkRequest:
 type: object
 properties:
-task_id:
+benchmark_id:
 type: string
 dataset_id:
 type: string
@@ -4702,7 +4702,7 @@ components:
 - type: object
 additionalProperties: false
 required:
-- task_id
+- benchmark_id
 - dataset_id
 - scoring_functions
 RegisterModelRequest:

@@ -1024,7 +1024,7 @@
 ")\n",
 "\n",
 "response = client.eval.evaluate_rows(\n",
-"    task_id=\"meta-reference::mmmu\",\n",
+"    benchmark_id=\"meta-reference::mmmu\",\n",
 "    input_rows=eval_rows,\n",
 "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
 "    task_config={\n",
@@ -1203,7 +1203,7 @@
 ")\n",
 "\n",
 "response = client.eval.evaluate_rows(\n",
-"    task_id=\"meta-reference::simpleqa\",\n",
+"    benchmark_id=\"meta-reference::simpleqa\",\n",
 "    input_rows=eval_rows.rows,\n",
 "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 "    task_config={\n",
@@ -1352,7 +1352,7 @@
 "}\n",
 "\n",
 "response = client.eval.evaluate_rows(\n",
-"    task_id=\"meta-reference::simpleqa\",\n",
+"    benchmark_id=\"meta-reference::simpleqa\",\n",
 "    input_rows=eval_rows.rows,\n",
 "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 "    task_config={\n",

@@ -48,7 +48,7 @@ client.benchmarks.register(
 )

 response = client.eval.evaluate_rows(
-    task_id="meta-reference::mmmu",
+    benchmark_id="meta-reference::mmmu",
     input_rows=eval_rows,
     scoring_functions=["basic::regex_parser_multiple_choice_answer"],
     task_config={
@@ -106,7 +106,7 @@ client.benchmarks.register(
 )

 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={
@@ -156,7 +156,7 @@ agent_config = {
 }

 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={

@@ -18,7 +18,7 @@ response = client.benchmarks.register(

 # Run evaluation
 job = client.eval.run_eval(
-    task_id="my_eval",
+    benchmark_id="my_eval",
     task_config={
         "type": "app",
         "eval_candidate": {"type": "agent", "config": agent_config},
@@ -26,5 +26,5 @@ job = client.eval.run_eval(
 )

 # Get results
-result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)
+result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
 ```

@@ -84,7 +84,7 @@ client.benchmarks.register(
 )

 response = client.eval.evaluate_rows(
-    task_id="meta-reference::mmmu",
+    benchmark_id="meta-reference::mmmu",
     input_rows=eval_rows,
     scoring_functions=["basic::regex_parser_multiple_choice_answer"],
     task_config={
@@ -142,7 +142,7 @@ client.benchmarks.register(
 )

 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={
@@ -192,7 +192,7 @@ agent_config = {
 }

 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={

@@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job

 Methods:

-- <code title="post /v1/eval/tasks/{task_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
-- <code title="post /v1/eval/tasks/{task_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>
+- <code title="post /v1/eval/tasks/{benchmark_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(benchmark_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
+- <code title="post /v1/eval/tasks/{benchmark_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(benchmark_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>

 ### Jobs

@@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse

 Methods:

-- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, task_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
-- <code title="delete /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, task_id) -> None</code>
-- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, task_id) -> Optional[JobStatusResponse]</code>
+- <code title="get /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, benchmark_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
+- <code title="delete /v1/eval/tasks/{benchmark_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, benchmark_id) -> None</code>
+- <code title="get /v1/eval/tasks/{benchmark_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, benchmark_id) -> Optional[JobStatusResponse]</code>

 ## Inspect

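The jobs sub-resource takes the renamed keyword as well. A hedged usage sketch of the three calls listed above, assuming the `client` and the `job` returned by the run_eval example near the top of this page ("my_eval" is the benchmark registered there):

```python
# Assumes `client` and `job` from the earlier run_eval sketch; return types follow
# the method list above.
status = client.eval.jobs.status(job.job_id, benchmark_id="my_eval")    # Optional[JobStatusResponse]
result = client.eval.jobs.retrieve(job.job_id, benchmark_id="my_eval")  # EvaluateResponse
client.eval.jobs.cancel(job.job_id, benchmark_id="my_eval")             # returns None
```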
@@ -76,27 +76,27 @@ class EvaluateResponse(BaseModel):


 class Eval(Protocol):
-    @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
+    @webmethod(route="/eval/tasks/{benchmark_id}/jobs", method="POST")
     async def run_eval(
         self,
-        task_id: str,
+        benchmark_id: str,
         task_config: BenchmarkConfig,
     ) -> Job: ...

-    @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
+    @webmethod(route="/eval/tasks/{benchmark_id}/evaluations", method="POST")
     async def evaluate_rows(
         self,
-        task_id: str,
+        benchmark_id: str,
         input_rows: List[Dict[str, Any]],
         scoring_functions: List[str],
         task_config: BenchmarkConfig,
     ) -> EvaluateResponse: ...

-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
-    async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
+    @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="GET")
+    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ...

-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
-    async def job_cancel(self, task_id: str, job_id: str) -> None: ...
+    @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="DELETE")
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ...

-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
-    async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ...
+    @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}/result", method="GET")
+    async def job_result(self, job_id: str, benchmark_id: str) -> EvaluateResponse: ...
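Because EvalRouter (further down in this diff) forwards the renamed argument to the selected provider, by keyword for run_eval and evaluate_rows, any provider implementing this protocol has to adopt the new parameter name too. A hedged stub only: the signatures follow the updated protocol above, while the class name and bodies are placeholders, not part of this commit:

```python
from typing import Any, Dict, List, Optional


class MyEvalProvider:
    # Placeholder provider showing the renamed `benchmark_id` parameter everywhere.
    async def run_eval(self, benchmark_id: str, task_config: Any) -> Any:
        raise NotImplementedError

    async def evaluate_rows(
        self,
        benchmark_id: str,
        input_rows: List[Dict[str, Any]],
        scoring_functions: List[str],
        task_config: Any,
    ) -> Any:
        raise NotImplementedError

    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[Any]:
        return None

    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
        raise NotImplementedError

    async def job_result(self, job_id: str, benchmark_id: str) -> Any:
        raise NotImplementedError
```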
@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .benchmarks import *  # noqa: F401 F403

@@ -1,86 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
-
-from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-
-
-class CommonBenchmarkFields(BaseModel):
-    dataset_id: str
-    scoring_functions: List[str]
-    metadata: Dict[str, Any] = Field(
-        default_factory=dict,
-        description="Metadata for this evaluation task",
-    )
-
-
-@json_schema_type
-class Benchmark(CommonBenchmarkFields, Resource):
-    type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value
-
-    @property
-    def task_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_benchmark_id(self) -> str:
-        return self.provider_resource_id
-
-
-class BenchmarkInput(CommonBenchmarkFields, BaseModel):
-    task_id: str
-    provider_id: Optional[str] = None
-    provider_benchmark_id: Optional[str] = None
-
-
-class ListBenchmarksResponse(BaseModel):
-    data: List[Benchmark]
-
-
-@runtime_checkable
-class Benchmarks(Protocol):
-    @webmethod(route="/eval/tasks", method="GET")
-    async def list_benchmarks(self) -> ListBenchmarksResponse: ...
-
-    @webmethod(route="/eval/tasks/{task_id}", method="GET")
-    async def get_benchmark(
-        self,
-        task_id: str,
-    ) -> Optional[Benchmark]: ...
-
-    @webmethod(route="/eval/tasks", method="POST")
-    async def register_benchmark(
-        self,
-        task_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...
-
-    @webmethod(route="/eval-tasks", method="GET")
-    async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse: ...
-
-    @webmethod(route="/eval-tasks/{benchmark_id}", method="GET")
-    async def DEPRECATED_get_benchmark(
-        self,
-        benchmark_id: str,
-    ) -> Optional[Benchmark]: ...
-
-    @webmethod(route="/eval-tasks", method="POST")
-    async def DEPRECATED_register_benchmark(
-        self,
-        benchmark_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...

@@ -105,7 +105,7 @@ class DownloadTask:
     output_file: str
     total_size: int = 0
     downloaded_size: int = 0
-    task_id: Optional[int] = None
+    benchmark_id: Optional[int] = None
     retries: int = 0
     max_retries: int = 3

@@ -183,8 +183,8 @@ class ParallelDownloader:
             )

             # Update the progress bar's total size once we know it
-            if task.task_id is not None:
-                self.progress.update(task.task_id, total=task.total_size)
+            if task.benchmark_id is not None:
+                self.progress.update(task.benchmark_id, total=task.total_size)

         except httpx.HTTPError as e:
             self.console.print(f"[red]Error getting file info: {str(e)}[/red]")
@@ -207,7 +207,7 @@ class ParallelDownloader:
                     file.write(chunk)
                     task.downloaded_size += len(chunk)
                     self.progress.update(
-                        task.task_id,
+                        task.benchmark_id,
                         completed=task.downloaded_size,
                     )

@@ -234,7 +234,7 @@ class ParallelDownloader:
         if os.path.exists(task.output_file):
             if self.verify_file_integrity(task):
                 self.console.print(f"[green]Already downloaded {task.output_file}[/green]")
-                self.progress.update(task.task_id, completed=task.total_size)
+                self.progress.update(task.benchmark_id, completed=task.total_size)
                 return

         await self.prepare_download(task)
@@ -258,7 +258,7 @@ class ParallelDownloader:
             raise DownloadError(f"Download failed: {str(e)}") from e

         except Exception as e:
-            self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]")
+            self.progress.update(task.benchmark_id, description=f"[red]Failed: {task.output_file}[/red]")
             raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e

     def has_disk_space(self, tasks: List[DownloadTask]) -> bool:
@@ -293,7 +293,7 @@ class ParallelDownloader:
         with self.progress:
             for task in tasks:
                 desc = f"Downloading {Path(task.output_file).name}"
-                task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)
+                task.benchmark_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)

             semaphore = asyncio.Semaphore(self.max_concurrent_downloads)

@@ -82,7 +82,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) -
     ) as progress:
         for filepath, expected_hash in checksums.items():
             full_path = model_dir / filepath
-            task_id = progress.add_task(f"Verifying {filepath}...", total=None)
+            benchmark_id = progress.add_task(f"Verifying {filepath}...", total=None)

             exists = full_path.exists()
             actual_hash = None
@@ -102,7 +102,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) -
                 )
             )

-            progress.remove_task(task_id)
+            progress.remove_task(benchmark_id)

     return results

@@ -347,23 +347,23 @@ class EvalRouter(Eval):

     async def run_eval(
         self,
-        task_id: str,
+        benchmark_id: str,
         task_config: AppBenchmarkConfig,
     ) -> Job:
-        return await self.routing_table.get_provider_impl(task_id).run_eval(
-            task_id=task_id,
+        return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
+            benchmark_id=benchmark_id,
             task_config=task_config,
         )

     async def evaluate_rows(
         self,
-        task_id: str,
+        benchmark_id: str,
         input_rows: List[Dict[str, Any]],
         scoring_functions: List[str],
         task_config: BenchmarkConfig,
     ) -> EvaluateResponse:
-        return await self.routing_table.get_provider_impl(task_id).evaluate_rows(
-            task_id=task_id,
+        return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
+            benchmark_id=benchmark_id,
             input_rows=input_rows,
             scoring_functions=scoring_functions,
             task_config=task_config,
@@ -371,28 +371,28 @@ class EvalRouter(Eval):

     async def job_status(
         self,
-        task_id: str,
+        benchmark_id: str,
         job_id: str,
     ) -> Optional[JobStatus]:
-        return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id)
+        return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)

     async def job_cancel(
         self,
-        task_id: str,
+        benchmark_id: str,
         job_id: str,
     ) -> None:
-        await self.routing_table.get_provider_impl(task_id).job_cancel(
-            task_id,
+        await self.routing_table.get_provider_impl(benchmark_id).job_cancel(
+            benchmark_id,
             job_id,
         )

     async def job_result(
         self,
-        task_id: str,
+        benchmark_id: str,
         job_id: str,
     ) -> EvaluateResponse:
-        return await self.routing_table.get_provider_impl(task_id).job_result(
-            task_id,
+        return await self.routing_table.get_provider_impl(benchmark_id).job_result(
+            benchmark_id,
             job_id,
         )

@@ -433,12 +433,12 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
     async def list_benchmarks(self) -> ListBenchmarksResponse:
         return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))

-    async def get_benchmark(self, task_id: str) -> Optional[Benchmark]:
-        return await self.get_object_by_identifier("benchmark", task_id)
+    async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]:
+        return await self.get_object_by_identifier("benchmark", benchmark_id)

     async def register_benchmark(
         self,
-        task_id: str,
+        benchmark_id: str,
         dataset_id: str,
         scoring_functions: List[str],
         metadata: Optional[Dict[str, Any]] = None,
@@ -455,9 +455,9 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
                 "No provider specified and multiple providers available. Please specify a provider_id."
             )
         if provider_benchmark_id is None:
-            provider_benchmark_id = task_id
+            provider_benchmark_id = benchmark_id
         benchmark = Benchmark(
-            identifier=task_id,
+            identifier=benchmark_id,
             dataset_id=dataset_id,
             scoring_functions=scoring_functions,
             metadata=metadata,

@@ -211,7 +211,7 @@ def run_evaluation_3():
             progress_bar.progress(progress, text=progress_text)
             # Run evaluation for current row
             eval_res = llama_stack_api.client.eval.evaluate_rows(
-                task_id=selected_benchmark,
+                benchmark_id=selected_benchmark,
                 input_rows=[r],
                 scoring_functions=benchmarks[selected_benchmark].scoring_functions,
                 task_config=benchmark_config,

@@ -83,10 +83,10 @@ class MetaReferenceEvalImpl(

     async def run_eval(
         self,
-        task_id: str,
+        benchmark_id: str,
         task_config: BenchmarkConfig,
     ) -> Job:
-        task_def = self.benchmarks[task_id]
+        task_def = self.benchmarks[benchmark_id]
         dataset_id = task_def.dataset_id
         candidate = task_config.eval_candidate
         scoring_functions = task_def.scoring_functions
@@ -97,7 +97,7 @@ class MetaReferenceEvalImpl(
             rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
         )
         res = await self.evaluate_rows(
-            task_id=task_id,
+            benchmark_id=benchmark_id,
             input_rows=all_rows.rows,
             scoring_functions=scoring_functions,
             task_config=task_config,
@@ -189,7 +189,7 @@ class MetaReferenceEvalImpl(

     async def evaluate_rows(
         self,
-        task_id: str,
+        benchmark_id: str,
         input_rows: List[Dict[str, Any]],
         scoring_functions: List[str],
         task_config: BenchmarkConfig,
@@ -219,17 +219,17 @@ class MetaReferenceEvalImpl(

         return EvaluateResponse(generations=generations, scores=score_response.results)

-    async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]:
+    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
         if job_id in self.jobs:
             return JobStatus.completed

         return None

-    async def job_cancel(self, task_id: str, job_id: str) -> None:
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
         raise NotImplementedError("Job cancel is not implemented yet")

-    async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse:
-        status = await self.job_status(task_id, job_id)
+    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
+        status = await self.job_status(benchmark_id, job_id)
         if not status or status != JobStatus.completed:
             raise ValueError(f"Job is not completed, Status: {status.value}")