diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 84c6fd99d..c656808a6 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -613,7 +613,7 @@
}
}
},
- "/v1/eval/tasks/{task_id}/evaluations": {
+ "/v1/eval/tasks/{benchmark_id}/evaluations": {
"post": {
"responses": {
"200": {
@@ -633,7 +633,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -811,7 +811,7 @@
]
}
},
- "/v1/eval/tasks/{task_id}": {
+ "/v1/eval/tasks/{benchmark_id}": {
"get": {
"responses": {
"200": {
@@ -838,7 +838,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -1431,7 +1431,7 @@
}
}
},
- "/v1/eval/tasks/{task_id}/jobs/{job_id}": {
+ "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": {
"get": {
"responses": {
"200": {
@@ -1458,7 +1458,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -1487,7 +1487,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -1505,7 +1505,7 @@
]
}
},
- "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
+ "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": {
"get": {
"responses": {
"200": {
@@ -1533,7 +1533,7 @@
}
},
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -2204,7 +2204,7 @@
]
}
},
- "/v1/eval/tasks/{task_id}/jobs": {
+ "/v1/eval/tasks/{benchmark_id}/jobs": {
"post": {
"responses": {
"200": {
@@ -2224,7 +2224,7 @@
"description": "",
"parameters": [
{
- "name": "task_id",
+ "name": "benchmark_id",
"in": "path",
"required": true,
"schema": {
@@ -7361,7 +7361,7 @@
"RegisterBenchmarkRequest": {
"type": "object",
"properties": {
- "task_id": {
+ "benchmark_id": {
"type": "string"
},
"dataset_id": {
@@ -7407,7 +7407,7 @@
},
"additionalProperties": false,
"required": [
- "task_id",
+ "benchmark_id",
"dataset_id",
"scoring_functions"
]
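
For orientation, a minimal sketch of hitting the renamed spec over HTTP. The server address and the assumption that registration is still exposed as `POST /v1/eval/tasks` are not shown in these hunks; only the `RegisterBenchmarkRequest` fields (`benchmark_id`, `dataset_id`, `scoring_functions`) come from the spec above.

```python
# Minimal sketch: register a benchmark against the updated spec.
# Assumes a Llama Stack server at localhost:5001 and that registration
# is still exposed as POST /v1/eval/tasks (not shown in this hunk).
import httpx

BASE_URL = "http://localhost:5001"  # assumed server address

payload = {
    "benchmark_id": "meta-reference::mmmu",  # formerly "task_id"
    "dataset_id": "mmmu",                    # placeholder dataset id
    "scoring_functions": ["basic::regex_parser_multiple_choice_answer"],
}

resp = httpx.post(f"{BASE_URL}/v1/eval/tasks", json=payload)
resp.raise_for_status()
```
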
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index dd0951fde..0f0a613a8 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -372,7 +372,7 @@ paths:
schema:
$ref: '#/components/schemas/EmbeddingsRequest'
required: true
- /v1/eval/tasks/{task_id}/evaluations:
+ /v1/eval/tasks/{benchmark_id}/evaluations:
post:
responses:
'200':
@@ -385,7 +385,7 @@ paths:
- Eval
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -490,7 +490,7 @@ paths:
required: true
schema:
type: string
- /v1/eval/tasks/{task_id}:
+ /v1/eval/tasks/{benchmark_id}:
get:
responses:
'200':
@@ -505,7 +505,7 @@ paths:
- Benchmarks
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -852,7 +852,7 @@ paths:
schema:
$ref: '#/components/schemas/InvokeToolRequest'
required: true
- /v1/eval/tasks/{task_id}/jobs/{job_id}:
+ /v1/eval/tasks/{benchmark_id}/jobs/{job_id}:
get:
responses:
'200':
@@ -867,7 +867,7 @@ paths:
- Eval
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -885,7 +885,7 @@ paths:
- Eval
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -895,7 +895,7 @@ paths:
required: true
schema:
type: string
- /v1/eval/tasks/{task_id}/jobs/{job_id}/result:
+ /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result:
get:
responses:
'200':
@@ -913,7 +913,7 @@ paths:
required: true
schema:
type: string
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -1328,7 +1328,7 @@ paths:
type: array
items:
type: string
- /v1/eval/tasks/{task_id}/jobs:
+ /v1/eval/tasks/{benchmark_id}/jobs:
post:
responses:
'200':
@@ -1341,7 +1341,7 @@ paths:
- Eval
description: ''
parameters:
- - name: task_id
+ - name: benchmark_id
in: path
required: true
schema:
@@ -4678,7 +4678,7 @@ components:
RegisterBenchmarkRequest:
type: object
properties:
- task_id:
+ benchmark_id:
type: string
dataset_id:
type: string
@@ -4702,7 +4702,7 @@ components:
- type: object
additionalProperties: false
required:
- - task_id
+ - benchmark_id
- dataset_id
- scoring_functions
RegisterModelRequest:
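
A sketch of exercising the renamed job routes from the YAML spec. Only the paths come from the hunks above; the server address, request body, and response field names are assumptions.

```python
# Sketch of the renamed eval-job routes; body and response shapes are assumed.
import httpx

BASE_URL = "http://localhost:5001"           # assumed
benchmark_id = "meta-reference::simpleqa"

# POST /v1/eval/tasks/{benchmark_id}/jobs starts a job; the body is assumed
# to carry the BenchmarkConfig under "task_config".
job = httpx.post(
    f"{BASE_URL}/v1/eval/tasks/{benchmark_id}/jobs",
    json={"task_config": {"type": "app", "eval_candidate": {"type": "agent", "config": {}}}},
).json()
job_id = job["job_id"]                       # assumed response field name

# GET .../jobs/{job_id} returns status; DELETE cancels; GET .../result returns scores.
status = httpx.get(f"{BASE_URL}/v1/eval/tasks/{benchmark_id}/jobs/{job_id}").json()
result = httpx.get(f"{BASE_URL}/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result").json()
```
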
diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
index 6e8480f94..599df201a 100644
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@@ -1024,7 +1024,7 @@
")\n",
"\n",
"response = client.eval.evaluate_rows(\n",
- " task_id=\"meta-reference::mmmu\",\n",
+ " benchmark_id=\"meta-reference::mmmu\",\n",
" input_rows=eval_rows,\n",
" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
" task_config={\n",
@@ -1203,7 +1203,7 @@
")\n",
"\n",
"response = client.eval.evaluate_rows(\n",
- " task_id=\"meta-reference::simpleqa\",\n",
+ " benchmark_id=\"meta-reference::simpleqa\",\n",
" input_rows=eval_rows.rows,\n",
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
" task_config={\n",
@@ -1352,7 +1352,7 @@
"}\n",
"\n",
"response = client.eval.evaluate_rows(\n",
- " task_id=\"meta-reference::simpleqa\",\n",
+ " benchmark_id=\"meta-reference::simpleqa\",\n",
" input_rows=eval_rows.rows,\n",
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
" task_config={\n",
diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md
index c1c371ca8..f28e0d5fd 100644
--- a/docs/source/building_applications/evals.md
+++ b/docs/source/building_applications/evals.md
@@ -48,7 +48,7 @@ client.benchmarks.register(
)
response = client.eval.evaluate_rows(
- task_id="meta-reference::mmmu",
+ benchmark_id="meta-reference::mmmu",
input_rows=eval_rows,
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
task_config={
@@ -106,7 +106,7 @@ client.benchmarks.register(
)
response = client.eval.evaluate_rows(
- task_id="meta-reference::simpleqa",
+ benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
@@ -156,7 +156,7 @@ agent_config = {
}
response = client.eval.evaluate_rows(
- task_id="meta-reference::simpleqa",
+ benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
diff --git a/docs/source/building_applications/evaluation.md b/docs/source/building_applications/evaluation.md
index df18c146c..ad220f751 100644
--- a/docs/source/building_applications/evaluation.md
+++ b/docs/source/building_applications/evaluation.md
@@ -18,7 +18,7 @@ response = client.benchmarks.register(
# Run evaluation
job = client.eval.run_eval(
- task_id="my_eval",
+ benchmark_id="my_eval",
task_config={
"type": "app",
"eval_candidate": {"type": "agent", "config": agent_config},
@@ -26,5 +26,5 @@ job = client.eval.run_eval(
)
# Get results
-result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)
+result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
```
diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md
index f0275511d..71dbb47e5 100644
--- a/docs/source/references/evals_reference/index.md
+++ b/docs/source/references/evals_reference/index.md
@@ -84,7 +84,7 @@ client.benchmarks.register(
)
response = client.eval.evaluate_rows(
- task_id="meta-reference::mmmu",
+ benchmark_id="meta-reference::mmmu",
input_rows=eval_rows,
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
task_config={
@@ -142,7 +142,7 @@ client.benchmarks.register(
)
response = client.eval.evaluate_rows(
- task_id="meta-reference::simpleqa",
+ benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
@@ -192,7 +192,7 @@ agent_config = {
}
response = client.eval.evaluate_rows(
- task_id="meta-reference::simpleqa",
+ benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
diff --git a/docs/source/references/python_sdk_reference/index.md b/docs/source/references/python_sdk_reference/index.md
index eca8c58f5..9d1130422 100644
--- a/docs/source/references/python_sdk_reference/index.md
+++ b/docs/source/references/python_sdk_reference/index.md
@@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job
Methods:
-- client.eval.evaluate_rows(task_id, \*\*params) -> EvaluateResponse
-- client.eval.run_eval(task_id, \*\*params) -> Job
+- client.eval.evaluate_rows(benchmark_id, \*\*params) -> EvaluateResponse
+- client.eval.run_eval(benchmark_id, \*\*params) -> Job
### Jobs
@@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse
Methods:
-- client.eval.jobs.retrieve(job_id, \*, task_id) -> EvaluateResponse
-- client.eval.jobs.cancel(job_id, \*, task_id) -> None
-- client.eval.jobs.status(job_id, \*, task_id) -> Optional[JobStatusResponse]
+- client.eval.jobs.retrieve(job_id, \*, benchmark_id) -> EvaluateResponse
+- client.eval.jobs.cancel(job_id, \*, benchmark_id) -> None
+- client.eval.jobs.status(job_id, \*, benchmark_id) -> Optional[JobStatusResponse]
## Inspect
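
A sketch tying the renamed SDK methods together: `run_eval` starts a job, and the `jobs` sub-resource is then queried with `benchmark_id` as a keyword argument. The client address and `agent_config` are placeholders assumed to be defined elsewhere.

```python
# Sketch of the renamed SDK surface; base_url and agent_config are placeholders.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")

job = client.eval.run_eval(
    benchmark_id="meta-reference::simpleqa",
    task_config={"type": "app", "eval_candidate": {"type": "agent", "config": agent_config}},
)

status = client.eval.jobs.status(job.job_id, benchmark_id="meta-reference::simpleqa")
if status:
    result = client.eval.jobs.retrieve(job.job_id, benchmark_id="meta-reference::simpleqa")
    print(result.scores)
else:
    client.eval.jobs.cancel(job.job_id, benchmark_id="meta-reference::simpleqa")
```
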
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index 16b96d618..273ef657c 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -76,27 +76,27 @@ class EvaluateResponse(BaseModel):
class Eval(Protocol):
- @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
+ @webmethod(route="/eval/tasks/{benchmark_id}/jobs", method="POST")
async def run_eval(
self,
- task_id: str,
+ benchmark_id: str,
task_config: BenchmarkConfig,
) -> Job: ...
- @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
+ @webmethod(route="/eval/tasks/{benchmark_id}/evaluations", method="POST")
async def evaluate_rows(
self,
- task_id: str,
+ benchmark_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
task_config: BenchmarkConfig,
) -> EvaluateResponse: ...
- @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
- async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
+ @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="GET")
+ async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ...
- @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
- async def job_cancel(self, task_id: str, job_id: str) -> None: ...
+ @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="DELETE")
+ async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ...
- @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
- async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ...
+ @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}/result", method="GET")
+ async def job_result(self, job_id: str, benchmark_id: str) -> EvaluateResponse: ...
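
For provider authors, a signature-only sketch of what an `Eval` implementation now looks like. The class name is hypothetical and the quoted types stand in for the real Pydantic models; only the method signatures (including the `job_result` argument order) mirror the protocol above.

```python
# Signature-only sketch of an Eval provider under the renamed protocol;
# class name, and the quoted types, are placeholders.
from typing import Any, Dict, List, Optional


class MyEvalProvider:
    async def run_eval(self, benchmark_id: str, task_config: "BenchmarkConfig") -> "Job": ...

    async def evaluate_rows(
        self,
        benchmark_id: str,
        input_rows: List[Dict[str, Any]],
        scoring_functions: List[str],
        task_config: "BenchmarkConfig",
    ) -> "EvaluateResponse": ...

    async def job_status(self, benchmark_id: str, job_id: str) -> Optional["JobStatus"]: ...

    async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ...

    async def job_result(self, job_id: str, benchmark_id: str) -> "EvaluateResponse": ...
```
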
diff --git a/llama_stack/apis/eval_tasks/__init__.py b/llama_stack/apis/eval_tasks/__init__.py
deleted file mode 100644
index f8f564957..000000000
--- a/llama_stack/apis/eval_tasks/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .benchmarks import * # noqa: F401 F403
diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py
deleted file mode 100644
index 7c8ed8dc0..000000000
--- a/llama_stack/apis/eval_tasks/eval_tasks.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
-
-from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-
-
-class CommonBenchmarkFields(BaseModel):
- dataset_id: str
- scoring_functions: List[str]
- metadata: Dict[str, Any] = Field(
- default_factory=dict,
- description="Metadata for this evaluation task",
- )
-
-
-@json_schema_type
-class Benchmark(CommonBenchmarkFields, Resource):
- type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value
-
- @property
- def task_id(self) -> str:
- return self.identifier
-
- @property
- def provider_benchmark_id(self) -> str:
- return self.provider_resource_id
-
-
-class BenchmarkInput(CommonBenchmarkFields, BaseModel):
- task_id: str
- provider_id: Optional[str] = None
- provider_benchmark_id: Optional[str] = None
-
-
-class ListBenchmarksResponse(BaseModel):
- data: List[Benchmark]
-
-
-@runtime_checkable
-class Benchmarks(Protocol):
- @webmethod(route="/eval/tasks", method="GET")
- async def list_benchmarks(self) -> ListBenchmarksResponse: ...
-
- @webmethod(route="/eval/tasks/{task_id}", method="GET")
- async def get_benchmark(
- self,
- task_id: str,
- ) -> Optional[Benchmark]: ...
-
- @webmethod(route="/eval/tasks", method="POST")
- async def register_benchmark(
- self,
- task_id: str,
- dataset_id: str,
- scoring_functions: List[str],
- provider_benchmark_id: Optional[str] = None,
- provider_id: Optional[str] = None,
- metadata: Optional[Dict[str, Any]] = None,
- ) -> None: ...
-
- @webmethod(route="/eval-tasks", method="GET")
- async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse: ...
-
- @webmethod(route="/eval-tasks/{benchmark_id}", method="GET")
- async def DEPRECATED_get_benchmark(
- self,
- benchmark_id: str,
- ) -> Optional[Benchmark]: ...
-
- @webmethod(route="/eval-tasks", method="POST")
- async def DEPRECATED_register_benchmark(
- self,
- benchmark_id: str,
- dataset_id: str,
- scoring_functions: List[str],
- provider_benchmark_id: Optional[str] = None,
- provider_id: Optional[str] = None,
- metadata: Optional[Dict[str, Any]] = None,
- ) -> None: ...
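
With the `eval_tasks` module removed, registration goes through the Benchmarks API. A minimal client-side sketch, assuming the replacement keeps the `RegisterBenchmarkRequest` fields shown in the spec hunks above; the dataset id and server address are placeholders.

```python
# Minimal sketch, assuming the replacement Benchmarks API exposes the
# RegisterBenchmarkRequest fields (benchmark_id, dataset_id, scoring_functions).
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")  # assumed address

client.benchmarks.register(
    benchmark_id="meta-reference::mmmu",
    dataset_id="mmmu",  # placeholder dataset
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
```
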
diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py
index 379ac49ca..7b9b303f4 100644
--- a/llama_stack/cli/download.py
+++ b/llama_stack/cli/download.py
@@ -105,7 +105,7 @@ class DownloadTask:
output_file: str
total_size: int = 0
downloaded_size: int = 0
- task_id: Optional[int] = None
+ benchmark_id: Optional[int] = None
retries: int = 0
max_retries: int = 3
@@ -183,8 +183,8 @@ class ParallelDownloader:
)
# Update the progress bar's total size once we know it
- if task.task_id is not None:
- self.progress.update(task.task_id, total=task.total_size)
+ if task.benchmark_id is not None:
+ self.progress.update(task.benchmark_id, total=task.total_size)
except httpx.HTTPError as e:
self.console.print(f"[red]Error getting file info: {str(e)}[/red]")
@@ -207,7 +207,7 @@ class ParallelDownloader:
file.write(chunk)
task.downloaded_size += len(chunk)
self.progress.update(
- task.task_id,
+ task.benchmark_id,
completed=task.downloaded_size,
)
@@ -234,7 +234,7 @@ class ParallelDownloader:
if os.path.exists(task.output_file):
if self.verify_file_integrity(task):
self.console.print(f"[green]Already downloaded {task.output_file}[/green]")
- self.progress.update(task.task_id, completed=task.total_size)
+ self.progress.update(task.benchmark_id, completed=task.total_size)
return
await self.prepare_download(task)
@@ -258,7 +258,7 @@ class ParallelDownloader:
raise DownloadError(f"Download failed: {str(e)}") from e
except Exception as e:
- self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]")
+ self.progress.update(task.benchmark_id, description=f"[red]Failed: {task.output_file}[/red]")
raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e
def has_disk_space(self, tasks: List[DownloadTask]) -> bool:
@@ -293,7 +293,7 @@ class ParallelDownloader:
with self.progress:
for task in tasks:
desc = f"Downloading {Path(task.output_file).name}"
- task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)
+ task.benchmark_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)
semaphore = asyncio.Semaphore(self.max_concurrent_downloads)
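
The renamed dataclass field in this file stores the handle returned by rich's `Progress.add_task`, which later `Progress.update` calls reference. A standalone sketch of that pattern, with a fake download loop standing in for the chunked writes above.

```python
# Standalone sketch of the rich progress pattern used in download.py:
# add_task returns a handle, and update() advances it as chunks arrive.
import time

from rich.progress import Progress

with Progress() as progress:
    handle = progress.add_task("Downloading model.safetensors", total=100)
    for _chunk in range(10):
        time.sleep(0.05)                     # stand-in for writing a chunk
        progress.update(handle, advance=10)  # or completed=<bytes so far>
```
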
diff --git a/llama_stack/cli/verify_download.py b/llama_stack/cli/verify_download.py
index 47993c361..ca72ca581 100644
--- a/llama_stack/cli/verify_download.py
+++ b/llama_stack/cli/verify_download.py
@@ -82,7 +82,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) -
) as progress:
for filepath, expected_hash in checksums.items():
full_path = model_dir / filepath
- task_id = progress.add_task(f"Verifying {filepath}...", total=None)
+ benchmark_id = progress.add_task(f"Verifying {filepath}...", total=None)
exists = full_path.exists()
actual_hash = None
@@ -102,7 +102,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) -
)
)
- progress.remove_task(task_id)
+ progress.remove_task(benchmark_id)
return results
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index 6697b03e2..f9f306767 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -347,23 +347,23 @@ class EvalRouter(Eval):
async def run_eval(
self,
- task_id: str,
+ benchmark_id: str,
task_config: AppBenchmarkConfig,
) -> Job:
- return await self.routing_table.get_provider_impl(task_id).run_eval(
- task_id=task_id,
+ return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
+ benchmark_id=benchmark_id,
task_config=task_config,
)
async def evaluate_rows(
self,
- task_id: str,
+ benchmark_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
task_config: BenchmarkConfig,
) -> EvaluateResponse:
- return await self.routing_table.get_provider_impl(task_id).evaluate_rows(
- task_id=task_id,
+ return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
+ benchmark_id=benchmark_id,
input_rows=input_rows,
scoring_functions=scoring_functions,
task_config=task_config,
@@ -371,28 +371,28 @@ class EvalRouter(Eval):
async def job_status(
self,
- task_id: str,
+ benchmark_id: str,
job_id: str,
) -> Optional[JobStatus]:
- return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id)
+ return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)
async def job_cancel(
self,
- task_id: str,
+ benchmark_id: str,
job_id: str,
) -> None:
- await self.routing_table.get_provider_impl(task_id).job_cancel(
- task_id,
+ await self.routing_table.get_provider_impl(benchmark_id).job_cancel(
+ benchmark_id,
job_id,
)
async def job_result(
self,
- task_id: str,
+ benchmark_id: str,
job_id: str,
) -> EvaluateResponse:
- return await self.routing_table.get_provider_impl(task_id).job_result(
- task_id,
+ return await self.routing_table.get_provider_impl(benchmark_id).job_result(
+ benchmark_id,
job_id,
)
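
A toy sketch of the dispatch pattern the router uses: the benchmark id selects a provider implementation from the routing table, and the same id is forwarded to it. The routing table and provider classes here are hypothetical stand-ins.

```python
# Toy sketch of dispatch-by-benchmark_id; FakeProvider and FakeRoutingTable
# are stand-ins for the real provider impls and routing table.
import asyncio


class FakeProvider:
    async def job_status(self, benchmark_id: str, job_id: str) -> str:
        return f"{benchmark_id}/{job_id}: completed"


class FakeRoutingTable:
    def __init__(self) -> None:
        self._providers = {"meta-reference::mmmu": FakeProvider()}

    def get_provider_impl(self, benchmark_id: str) -> FakeProvider:
        return self._providers[benchmark_id]


class ToyEvalRouter:
    def __init__(self, routing_table: FakeRoutingTable) -> None:
        self.routing_table = routing_table

    async def job_status(self, benchmark_id: str, job_id: str) -> str:
        # Same shape as EvalRouter.job_status above: resolve provider, then forward.
        return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)


print(asyncio.run(ToyEvalRouter(FakeRoutingTable()).job_status("meta-reference::mmmu", "job-1")))
```
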
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
index 6c1b06ed6..a52ab7fbd 100644
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@@ -433,12 +433,12 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
async def list_benchmarks(self) -> ListBenchmarksResponse:
return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))
- async def get_benchmark(self, task_id: str) -> Optional[Benchmark]:
- return await self.get_object_by_identifier("benchmark", task_id)
+ async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]:
+ return await self.get_object_by_identifier("benchmark", benchmark_id)
async def register_benchmark(
self,
- task_id: str,
+ benchmark_id: str,
dataset_id: str,
scoring_functions: List[str],
metadata: Optional[Dict[str, Any]] = None,
@@ -455,9 +455,9 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
"No provider specified and multiple providers available. Please specify a provider_id."
)
if provider_benchmark_id is None:
- provider_benchmark_id = task_id
+ provider_benchmark_id = benchmark_id
benchmark = Benchmark(
- identifier=task_id,
+ identifier=benchmark_id,
dataset_id=dataset_id,
scoring_functions=scoring_functions,
metadata=metadata,
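
A standalone restatement of the defaulting seen in `register_benchmark` above: the provider-side id falls back to the public `benchmark_id` when none is supplied.

```python
# Mirrors the provider_benchmark_id defaulting in register_benchmark above.
from typing import Optional


def resolve_provider_benchmark_id(benchmark_id: str, provider_benchmark_id: Optional[str] = None) -> str:
    return provider_benchmark_id if provider_benchmark_id is not None else benchmark_id


assert resolve_provider_benchmark_id("meta-reference::mmmu") == "meta-reference::mmmu"
assert resolve_provider_benchmark_id("meta-reference::mmmu", "mmmu-v2") == "mmmu-v2"
```
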
diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py
index 39385dd14..753c574a2 100644
--- a/llama_stack/distribution/ui/page/evaluations/native_eval.py
+++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py
@@ -211,7 +211,7 @@ def run_evaluation_3():
progress_bar.progress(progress, text=progress_text)
# Run evaluation for current row
eval_res = llama_stack_api.client.eval.evaluate_rows(
- task_id=selected_benchmark,
+ benchmark_id=selected_benchmark,
input_rows=[r],
scoring_functions=benchmarks[selected_benchmark].scoring_functions,
task_config=benchmark_config,
diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py
index 07310f59c..a02418e74 100644
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@@ -83,10 +83,10 @@ class MetaReferenceEvalImpl(
async def run_eval(
self,
- task_id: str,
+ benchmark_id: str,
task_config: BenchmarkConfig,
) -> Job:
- task_def = self.benchmarks[task_id]
+ task_def = self.benchmarks[benchmark_id]
dataset_id = task_def.dataset_id
candidate = task_config.eval_candidate
scoring_functions = task_def.scoring_functions
@@ -97,7 +97,7 @@ class MetaReferenceEvalImpl(
rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
)
res = await self.evaluate_rows(
- task_id=task_id,
+ benchmark_id=benchmark_id,
input_rows=all_rows.rows,
scoring_functions=scoring_functions,
task_config=task_config,
@@ -189,7 +189,7 @@ class MetaReferenceEvalImpl(
async def evaluate_rows(
self,
- task_id: str,
+ benchmark_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
task_config: BenchmarkConfig,
@@ -219,17 +219,17 @@ class MetaReferenceEvalImpl(
return EvaluateResponse(generations=generations, scores=score_response.results)
- async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]:
+ async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
if job_id in self.jobs:
return JobStatus.completed
return None
- async def job_cancel(self, task_id: str, job_id: str) -> None:
+ async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
raise NotImplementedError("Job cancel is not implemented yet")
- async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse:
- status = await self.job_status(task_id, job_id)
+ async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
+ status = await self.job_status(benchmark_id, job_id)
if not status or status != JobStatus.completed:
raise ValueError(f"Job is not completed, Status: {status.value}")
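
A toy replication of the bookkeeping shown above: jobs are tracked in a dict keyed by `job_id`, `job_status` reports completed for known ids, and `job_result` refuses to return anything for unknown jobs. The class and stored value are simplified stand-ins for the real implementation.

```python
# Toy replication of the meta-reference job bookkeeping; the stored dicts
# stand in for real EvaluateResponse objects.
from enum import Enum
from typing import Dict, Optional


class JobStatus(Enum):
    completed = "completed"


class ToyEval:
    def __init__(self) -> None:
        self.jobs: Dict[str, dict] = {}

    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
        return JobStatus.completed if job_id in self.jobs else None

    async def job_result(self, benchmark_id: str, job_id: str) -> dict:
        status = await self.job_status(benchmark_id, job_id)
        if status != JobStatus.completed:
            raise ValueError(f"Job is not completed, status: {status}")
        return self.jobs[job_id]
```
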