From 017d24fe6561005b2debdddb5935e4c475629862 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Feb 2025 20:34:32 -0800 Subject: [PATCH] replace task_id -> benchmark_id --- docs/_static/llama-stack-spec.html | 26 +++--- docs/_static/llama-stack-spec.yaml | 26 +++--- .../Llama_Stack_Benchmark_Evals.ipynb | 6 +- docs/source/building_applications/evals.md | 6 +- .../building_applications/evaluation.md | 4 +- .../references/evals_reference/index.md | 6 +- .../references/python_sdk_reference/index.md | 10 +-- llama_stack/apis/eval/eval.py | 20 ++--- llama_stack/apis/eval_tasks/__init__.py | 7 -- llama_stack/apis/eval_tasks/eval_tasks.py | 86 ------------------- llama_stack/cli/download.py | 14 +-- llama_stack/cli/verify_download.py | 4 +- llama_stack/distribution/routers/routers.py | 28 +++--- .../distribution/routers/routing_tables.py | 10 +-- .../ui/page/evaluations/native_eval.py | 2 +- .../inline/eval/meta_reference/eval.py | 16 ++-- 16 files changed, 89 insertions(+), 182 deletions(-) delete mode 100644 llama_stack/apis/eval_tasks/__init__.py delete mode 100644 llama_stack/apis/eval_tasks/eval_tasks.py diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 84c6fd99d..c656808a6 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -613,7 +613,7 @@ } } }, - "/v1/eval/tasks/{task_id}/evaluations": { + "/v1/eval/tasks/{benchmark_id}/evaluations": { "post": { "responses": { "200": { @@ -633,7 +633,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -811,7 +811,7 @@ ] } }, - "/v1/eval/tasks/{task_id}": { + "/v1/eval/tasks/{benchmark_id}": { "get": { "responses": { "200": { @@ -838,7 +838,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1431,7 +1431,7 @@ } } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}": { + "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": { "get": { "responses": { "200": { @@ -1458,7 +1458,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1487,7 +1487,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -1505,7 +1505,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs/{job_id}/result": { + "/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": { "get": { "responses": { "200": { @@ -1533,7 +1533,7 @@ } }, { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -2204,7 +2204,7 @@ ] } }, - "/v1/eval/tasks/{task_id}/jobs": { + "/v1/eval/tasks/{benchmark_id}/jobs": { "post": { "responses": { "200": { @@ -2224,7 +2224,7 @@ "description": "", "parameters": [ { - "name": "task_id", + "name": "benchmark_id", "in": "path", "required": true, "schema": { @@ -7361,7 +7361,7 @@ "RegisterBenchmarkRequest": { "type": "object", "properties": { - "task_id": { + "benchmark_id": { "type": "string" }, "dataset_id": { @@ -7407,7 +7407,7 @@ }, "additionalProperties": false, "required": [ - "task_id", + "benchmark_id", "dataset_id", "scoring_functions" ] diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index dd0951fde..0f0a613a8 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -372,7 +372,7 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - 
/v1/eval/tasks/{task_id}/evaluations: + /v1/eval/tasks/{benchmark_id}/evaluations: post: responses: '200': @@ -385,7 +385,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -490,7 +490,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}: + /v1/eval/tasks/{benchmark_id}: get: responses: '200': @@ -505,7 +505,7 @@ paths: - Benchmarks description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -852,7 +852,7 @@ paths: schema: $ref: '#/components/schemas/InvokeToolRequest' required: true - /v1/eval/tasks/{task_id}/jobs/{job_id}: + /v1/eval/tasks/{benchmark_id}/jobs/{job_id}: get: responses: '200': @@ -867,7 +867,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -885,7 +885,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -895,7 +895,7 @@ paths: required: true schema: type: string - /v1/eval/tasks/{task_id}/jobs/{job_id}/result: + /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result: get: responses: '200': @@ -913,7 +913,7 @@ paths: required: true schema: type: string - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -1328,7 +1328,7 @@ paths: type: array items: type: string - /v1/eval/tasks/{task_id}/jobs: + /v1/eval/tasks/{benchmark_id}/jobs: post: responses: '200': @@ -1341,7 +1341,7 @@ paths: - Eval description: '' parameters: - - name: task_id + - name: benchmark_id in: path required: true schema: @@ -4678,7 +4678,7 @@ components: RegisterBenchmarkRequest: type: object properties: - task_id: + benchmark_id: type: string dataset_id: type: string @@ -4702,7 +4702,7 @@ components: - type: object additionalProperties: false required: - - task_id + - benchmark_id - dataset_id - scoring_functions RegisterModelRequest: diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 6e8480f94..599df201a 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -1024,7 +1024,7 @@ ")\n", "\n", "response = client.eval.evaluate_rows(\n", - " task_id=\"meta-reference::mmmu\",\n", + " benchmark_id=\"meta-reference::mmmu\",\n", " input_rows=eval_rows,\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", " task_config={\n", @@ -1203,7 +1203,7 @@ ")\n", "\n", "response = client.eval.evaluate_rows(\n", - " task_id=\"meta-reference::simpleqa\",\n", + " benchmark_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " task_config={\n", @@ -1352,7 +1352,7 @@ "}\n", "\n", "response = client.eval.evaluate_rows(\n", - " task_id=\"meta-reference::simpleqa\",\n", + " benchmark_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " task_config={\n", diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md index c1c371ca8..f28e0d5fd 100644 --- a/docs/source/building_applications/evals.md +++ b/docs/source/building_applications/evals.md @@ -48,7 +48,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], 
task_config={ @@ -106,7 +106,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -156,7 +156,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ diff --git a/docs/source/building_applications/evaluation.md b/docs/source/building_applications/evaluation.md index df18c146c..ad220f751 100644 --- a/docs/source/building_applications/evaluation.md +++ b/docs/source/building_applications/evaluation.md @@ -18,7 +18,7 @@ response = client.benchmarks.register( # Run evaluation job = client.eval.run_eval( - task_id="my_eval", + benchmark_id="my_eval", task_config={ "type": "app", "eval_candidate": {"type": "agent", "config": agent_config}, @@ -26,5 +26,5 @@ job = client.eval.run_eval( ) # Get results -result = client.eval.job_result(task_id="my_eval", job_id=job.job_id) +result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id) ``` diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md index f0275511d..71dbb47e5 100644 --- a/docs/source/references/evals_reference/index.md +++ b/docs/source/references/evals_reference/index.md @@ -84,7 +84,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::mmmu", + benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], task_config={ @@ -142,7 +142,7 @@ client.benchmarks.register( ) response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ @@ -192,7 +192,7 @@ agent_config = { } response = client.eval.evaluate_rows( - task_id="meta-reference::simpleqa", + benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], task_config={ diff --git a/docs/source/references/python_sdk_reference/index.md b/docs/source/references/python_sdk_reference/index.md index eca8c58f5..9d1130422 100644 --- a/docs/source/references/python_sdk_reference/index.md +++ b/docs/source/references/python_sdk_reference/index.md @@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job Methods: -- client.eval.evaluate_rows(task_id, \*\*params) -> EvaluateResponse -- client.eval.run_eval(task_id, \*\*params) -> Job +- client.eval.evaluate_rows(benchmark_id, \*\*params) -> EvaluateResponse +- client.eval.run_eval(benchmark_id, \*\*params) -> Job ### Jobs @@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse Methods: -- client.eval.jobs.retrieve(job_id, \*, task_id) -> EvaluateResponse -- client.eval.jobs.cancel(job_id, \*, task_id) -> None -- client.eval.jobs.status(job_id, \*, task_id) -> Optional[JobStatusResponse] +- client.eval.jobs.retrieve(job_id, \*, benchmark_id) -> EvaluateResponse +- client.eval.jobs.cancel(job_id, \*, benchmark_id) -> None +- client.eval.jobs.status(job_id, \*, benchmark_id) -> Optional[JobStatusResponse] ## Inspect diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index 16b96d618..273ef657c 100644 --- a/llama_stack/apis/eval/eval.py +++ 
b/llama_stack/apis/eval/eval.py @@ -76,27 +76,27 @@ class EvaluateResponse(BaseModel): class Eval(Protocol): - @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST") + @webmethod(route="/eval/tasks/{benchmark_id}/jobs", method="POST") async def run_eval( self, - task_id: str, + benchmark_id: str, task_config: BenchmarkConfig, ) -> Job: ... - @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST") + @webmethod(route="/eval/tasks/{benchmark_id}/evaluations", method="POST") async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], task_config: BenchmarkConfig, ) -> EvaluateResponse: ... - @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET") - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ... + @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="GET") + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ... - @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE") - async def job_cancel(self, task_id: str, job_id: str) -> None: ... + @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="DELETE") + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... - @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET") - async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ... + @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}/result", method="GET") + async def job_result(self, job_id: str, benchmark_id: str) -> EvaluateResponse: ... diff --git a/llama_stack/apis/eval_tasks/__init__.py b/llama_stack/apis/eval_tasks/__init__.py deleted file mode 100644 index f8f564957..000000000 --- a/llama_stack/apis/eval_tasks/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .benchmarks import * # noqa: F401 F403 diff --git a/llama_stack/apis/eval_tasks/eval_tasks.py b/llama_stack/apis/eval_tasks/eval_tasks.py deleted file mode 100644 index 7c8ed8dc0..000000000 --- a/llama_stack/apis/eval_tasks/eval_tasks.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable - -from llama_models.schema_utils import json_schema_type, webmethod -from pydantic import BaseModel, Field - -from llama_stack.apis.resource import Resource, ResourceType - - -class CommonBenchmarkFields(BaseModel): - dataset_id: str - scoring_functions: List[str] - metadata: Dict[str, Any] = Field( - default_factory=dict, - description="Metadata for this evaluation task", - ) - - -@json_schema_type -class Benchmark(CommonBenchmarkFields, Resource): - type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value - - @property - def task_id(self) -> str: - return self.identifier - - @property - def provider_benchmark_id(self) -> str: - return self.provider_resource_id - - -class BenchmarkInput(CommonBenchmarkFields, BaseModel): - task_id: str - provider_id: Optional[str] = None - provider_benchmark_id: Optional[str] = None - - -class ListBenchmarksResponse(BaseModel): - data: List[Benchmark] - - -@runtime_checkable -class Benchmarks(Protocol): - @webmethod(route="/eval/tasks", method="GET") - async def list_benchmarks(self) -> ListBenchmarksResponse: ... - - @webmethod(route="/eval/tasks/{task_id}", method="GET") - async def get_benchmark( - self, - task_id: str, - ) -> Optional[Benchmark]: ... - - @webmethod(route="/eval/tasks", method="POST") - async def register_benchmark( - self, - task_id: str, - dataset_id: str, - scoring_functions: List[str], - provider_benchmark_id: Optional[str] = None, - provider_id: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - ) -> None: ... - - @webmethod(route="/eval-tasks", method="GET") - async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse: ... - - @webmethod(route="/eval-tasks/{benchmark_id}", method="GET") - async def DEPRECATED_get_benchmark( - self, - benchmark_id: str, - ) -> Optional[Benchmark]: ... - - @webmethod(route="/eval-tasks", method="POST") - async def DEPRECATED_register_benchmark( - self, - benchmark_id: str, - dataset_id: str, - scoring_functions: List[str], - provider_benchmark_id: Optional[str] = None, - provider_id: Optional[str] = None, - metadata: Optional[Dict[str, Any]] = None, - ) -> None: ... 
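Reviewer note: the deleted eval_tasks module above was the last task_id-keyed surface; benchmarks are now registered and looked up by benchmark_id. Below is a minimal sketch of the renamed registration call, assuming the Python client mirrors the server-side parameter names introduced in this patch; the endpoint and dataset identifier are illustrative placeholders, not values from the patch.

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")  # illustrative endpoint, not from this patch

client.benchmarks.register(
    benchmark_id="meta-reference::mmmu",  # formerly task_id
    dataset_id="mmmu",                    # illustrative dataset identifier
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
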
diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py index 379ac49ca..7b9b303f4 100644 --- a/llama_stack/cli/download.py +++ b/llama_stack/cli/download.py @@ -105,7 +105,7 @@ class DownloadTask: output_file: str total_size: int = 0 downloaded_size: int = 0 - task_id: Optional[int] = None + benchmark_id: Optional[int] = None retries: int = 0 max_retries: int = 3 @@ -183,8 +183,8 @@ class ParallelDownloader: ) # Update the progress bar's total size once we know it - if task.task_id is not None: - self.progress.update(task.task_id, total=task.total_size) + if task.benchmark_id is not None: + self.progress.update(task.benchmark_id, total=task.total_size) except httpx.HTTPError as e: self.console.print(f"[red]Error getting file info: {str(e)}[/red]") @@ -207,7 +207,7 @@ class ParallelDownloader: file.write(chunk) task.downloaded_size += len(chunk) self.progress.update( - task.task_id, + task.benchmark_id, completed=task.downloaded_size, ) @@ -234,7 +234,7 @@ class ParallelDownloader: if os.path.exists(task.output_file): if self.verify_file_integrity(task): self.console.print(f"[green]Already downloaded {task.output_file}[/green]") - self.progress.update(task.task_id, completed=task.total_size) + self.progress.update(task.benchmark_id, completed=task.total_size) return await self.prepare_download(task) @@ -258,7 +258,7 @@ class ParallelDownloader: raise DownloadError(f"Download failed: {str(e)}") from e except Exception as e: - self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]") + self.progress.update(task.benchmark_id, description=f"[red]Failed: {task.output_file}[/red]") raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e def has_disk_space(self, tasks: List[DownloadTask]) -> bool: @@ -293,7 +293,7 @@ class ParallelDownloader: with self.progress: for task in tasks: desc = f"Downloading {Path(task.output_file).name}" - task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) + task.benchmark_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size) semaphore = asyncio.Semaphore(self.max_concurrent_downloads) diff --git a/llama_stack/cli/verify_download.py b/llama_stack/cli/verify_download.py index 47993c361..ca72ca581 100644 --- a/llama_stack/cli/verify_download.py +++ b/llama_stack/cli/verify_download.py @@ -82,7 +82,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) - ) as progress: for filepath, expected_hash in checksums.items(): full_path = model_dir / filepath - task_id = progress.add_task(f"Verifying {filepath}...", total=None) + benchmark_id = progress.add_task(f"Verifying {filepath}...", total=None) exists = full_path.exists() actual_hash = None @@ -102,7 +102,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) - ) ) - progress.remove_task(task_id) + progress.remove_task(benchmark_id) return results diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 6697b03e2..f9f306767 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -347,23 +347,23 @@ class EvalRouter(Eval): async def run_eval( self, - task_id: str, + benchmark_id: str, task_config: AppBenchmarkConfig, ) -> Job: - return await self.routing_table.get_provider_impl(task_id).run_eval( - task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).run_eval( + 
benchmark_id=benchmark_id, task_config=task_config, ) async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], task_config: BenchmarkConfig, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).evaluate_rows( - task_id=task_id, + return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows( + benchmark_id=benchmark_id, input_rows=input_rows, scoring_functions=scoring_functions, task_config=task_config, @@ -371,28 +371,28 @@ class EvalRouter(Eval): async def job_status( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> Optional[JobStatus]: - return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id) + return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id) async def job_cancel( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> None: - await self.routing_table.get_provider_impl(task_id).job_cancel( - task_id, + await self.routing_table.get_provider_impl(benchmark_id).job_cancel( + benchmark_id, job_id, ) async def job_result( self, - task_id: str, + benchmark_id: str, job_id: str, ) -> EvaluateResponse: - return await self.routing_table.get_provider_impl(task_id).job_result( - task_id, + return await self.routing_table.get_provider_impl(benchmark_id).job_result( + benchmark_id, job_id, ) diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 6c1b06ed6..a52ab7fbd 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -433,12 +433,12 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): async def list_benchmarks(self) -> ListBenchmarksResponse: return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark")) - async def get_benchmark(self, task_id: str) -> Optional[Benchmark]: - return await self.get_object_by_identifier("benchmark", task_id) + async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]: + return await self.get_object_by_identifier("benchmark", benchmark_id) async def register_benchmark( self, - task_id: str, + benchmark_id: str, dataset_id: str, scoring_functions: List[str], metadata: Optional[Dict[str, Any]] = None, @@ -455,9 +455,9 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): "No provider specified and multiple providers available. Please specify a provider_id." 
) if provider_benchmark_id is None: - provider_benchmark_id = task_id + provider_benchmark_id = benchmark_id benchmark = Benchmark( - identifier=task_id, + identifier=benchmark_id, dataset_id=dataset_id, scoring_functions=scoring_functions, metadata=metadata, diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py index 39385dd14..753c574a2 100644 --- a/llama_stack/distribution/ui/page/evaluations/native_eval.py +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -211,7 +211,7 @@ def run_evaluation_3(): progress_bar.progress(progress, text=progress_text) # Run evaluation for current row eval_res = llama_stack_api.client.eval.evaluate_rows( - task_id=selected_benchmark, + benchmark_id=selected_benchmark, input_rows=[r], scoring_functions=benchmarks[selected_benchmark].scoring_functions, task_config=benchmark_config, diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 07310f59c..a02418e74 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -83,10 +83,10 @@ class MetaReferenceEvalImpl( async def run_eval( self, - task_id: str, + benchmark_id: str, task_config: BenchmarkConfig, ) -> Job: - task_def = self.benchmarks[task_id] + task_def = self.benchmarks[benchmark_id] dataset_id = task_def.dataset_id candidate = task_config.eval_candidate scoring_functions = task_def.scoring_functions @@ -97,7 +97,7 @@ class MetaReferenceEvalImpl( rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples), ) res = await self.evaluate_rows( - task_id=task_id, + benchmark_id=benchmark_id, input_rows=all_rows.rows, scoring_functions=scoring_functions, task_config=task_config, @@ -189,7 +189,7 @@ class MetaReferenceEvalImpl( async def evaluate_rows( self, - task_id: str, + benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], task_config: BenchmarkConfig, @@ -219,17 +219,17 @@ class MetaReferenceEvalImpl( return EvaluateResponse(generations=generations, scores=score_response.results) - async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: + async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: if job_id in self.jobs: return JobStatus.completed return None - async def job_cancel(self, task_id: str, job_id: str) -> None: + async def job_cancel(self, benchmark_id: str, job_id: str) -> None: raise NotImplementedError("Job cancel is not implemented yet") - async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse: - status = await self.job_status(task_id, job_id) + async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: + status = await self.job_status(benchmark_id, job_id) if not status or status != JobStatus.completed: raise ValueError(f"Job is not completed, Status: {status.value}")
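
Reviewer note: end to end, the renamed Eval surface reads as sketched below. This assumes the Python client exposes the same parameter names as the server routes changed in this patch; the benchmark id, endpoint, and agent configuration are placeholder values.

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")  # illustrative endpoint, not from this patch

agent_config = {"model": "meta-llama/Llama-3.1-8B-Instruct", "instructions": "Answer the question."}  # placeholder candidate config

job = client.eval.run_eval(
    benchmark_id="my_eval",  # formerly task_id
    task_config={
        "type": "app",
        "eval_candidate": {"type": "agent", "config": agent_config},
    },
)

# Job status and results are now keyed on benchmark_id as well.
status = client.eval.jobs.status(job.job_id, benchmark_id="my_eval")
result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
print(result.scores)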