Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-10 04:08:31 +00:00)

Commit 017d24fe65 (parent b20742fce7): replace task_id -> benchmark_id

16 changed files with 89 additions and 182 deletions
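The rename only touches the keyword argument on the client side and the path parameter on the server side. A minimal before/after sketch, assuming a reachable Llama Stack server and a benchmark already registered as "my_eval"; the call shapes mirror the documentation snippets updated below, while the base URL and the empty agent_config are placeholders:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5000")  # placeholder server address
agent_config: dict = {}  # placeholder; see the doc hunks below for a full agent config

# Before this commit the eval endpoints took `task_id`:
#   job = client.eval.run_eval(task_id="my_eval", task_config=...)
#   result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)

# After this commit the same calls take `benchmark_id`:
job = client.eval.run_eval(
    benchmark_id="my_eval",
    task_config={
        "type": "app",
        "eval_candidate": {"type": "agent", "config": agent_config},
    },
)
result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
```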
docs/_static/llama-stack-spec.html (vendored, 26 lines changed)

@@ -613,7 +613,7 @@
 }
 }
 },
-"/v1/eval/tasks/{task_id}/evaluations": {
+"/v1/eval/tasks/{benchmark_id}/evaluations": {
 "post": {
 "responses": {
 "200": {
@@ -633,7 +633,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -811,7 +811,7 @@
 ]
 }
 },
-"/v1/eval/tasks/{task_id}": {
+"/v1/eval/tasks/{benchmark_id}": {
 "get": {
 "responses": {
 "200": {
@@ -838,7 +838,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -1431,7 +1431,7 @@
 }
 }
 },
-"/v1/eval/tasks/{task_id}/jobs/{job_id}": {
+"/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": {
 "get": {
 "responses": {
 "200": {
@@ -1458,7 +1458,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -1487,7 +1487,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -1505,7 +1505,7 @@
 ]
 }
 },
-"/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
+"/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": {
 "get": {
 "responses": {
 "200": {
@@ -1533,7 +1533,7 @@
 }
 },
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -2204,7 +2204,7 @@
 ]
 }
 },
-"/v1/eval/tasks/{task_id}/jobs": {
+"/v1/eval/tasks/{benchmark_id}/jobs": {
 "post": {
 "responses": {
 "200": {
@@ -2224,7 +2224,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -7361,7 +7361,7 @@
 "RegisterBenchmarkRequest": {
 "type": "object",
 "properties": {
-"task_id": {
+"benchmark_id": {
 "type": "string"
 },
 "dataset_id": {
@@ -7407,7 +7407,7 @@
 },
 "additionalProperties": false,
 "required": [
-"task_id",
+"benchmark_id",
 "dataset_id",
 "scoring_functions"
 ]

docs/_static/llama-stack-spec.yaml (vendored, 26 lines changed)

@@ -372,7 +372,7 @@ paths:
 schema:
 $ref: '#/components/schemas/EmbeddingsRequest'
 required: true
-/v1/eval/tasks/{task_id}/evaluations:
+/v1/eval/tasks/{benchmark_id}/evaluations:
 post:
 responses:
 '200':
@@ -385,7 +385,7 @@ paths:
 - Eval
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -490,7 +490,7 @@ paths:
 required: true
 schema:
 type: string
-/v1/eval/tasks/{task_id}:
+/v1/eval/tasks/{benchmark_id}:
 get:
 responses:
 '200':
@@ -505,7 +505,7 @@ paths:
 - Benchmarks
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -852,7 +852,7 @@ paths:
 schema:
 $ref: '#/components/schemas/InvokeToolRequest'
 required: true
-/v1/eval/tasks/{task_id}/jobs/{job_id}:
+/v1/eval/tasks/{benchmark_id}/jobs/{job_id}:
 get:
 responses:
 '200':
@@ -867,7 +867,7 @@ paths:
 - Eval
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -885,7 +885,7 @@ paths:
 - Eval
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -895,7 +895,7 @@ paths:
 required: true
 schema:
 type: string
-/v1/eval/tasks/{task_id}/jobs/{job_id}/result:
+/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result:
 get:
 responses:
 '200':
@@ -913,7 +913,7 @@ paths:
 required: true
 schema:
 type: string
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -1328,7 +1328,7 @@ paths:
 type: array
 items:
 type: string
-/v1/eval/tasks/{task_id}/jobs:
+/v1/eval/tasks/{benchmark_id}/jobs:
 post:
 responses:
 '200':
@@ -1341,7 +1341,7 @@ paths:
 - Eval
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -4678,7 +4678,7 @@ components:
 RegisterBenchmarkRequest:
 type: object
 properties:
-task_id:
+benchmark_id:
 type: string
 dataset_id:
 type: string
@@ -4702,7 +4702,7 @@ components:
 - type: object
 additionalProperties: false
 required:
-- task_id
+- benchmark_id
 - dataset_id
 - scoring_functions
 RegisterModelRequest:

@@ -1024,7 +1024,7 @@
 ")\n",
 "\n",
 "response = client.eval.evaluate_rows(\n",
-"    task_id=\"meta-reference::mmmu\",\n",
+"    benchmark_id=\"meta-reference::mmmu\",\n",
 "    input_rows=eval_rows,\n",
 "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
 "    task_config={\n",
@@ -1203,7 +1203,7 @@
 ")\n",
 "\n",
 "response = client.eval.evaluate_rows(\n",
-"    task_id=\"meta-reference::simpleqa\",\n",
+"    benchmark_id=\"meta-reference::simpleqa\",\n",
 "    input_rows=eval_rows.rows,\n",
 "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 "    task_config={\n",
@@ -1352,7 +1352,7 @@
 "}\n",
 "\n",
 "response = client.eval.evaluate_rows(\n",
-"    task_id=\"meta-reference::simpleqa\",\n",
+"    benchmark_id=\"meta-reference::simpleqa\",\n",
 "    input_rows=eval_rows.rows,\n",
 "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 "    task_config={\n",

@@ -48,7 +48,7 @@ client.benchmarks.register(
 )

 response = client.eval.evaluate_rows(
-    task_id="meta-reference::mmmu",
+    benchmark_id="meta-reference::mmmu",
     input_rows=eval_rows,
     scoring_functions=["basic::regex_parser_multiple_choice_answer"],
     task_config={
@@ -106,7 +106,7 @@ client.benchmarks.register(
 )

 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={
@@ -156,7 +156,7 @@ agent_config = {
 }

 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={

@@ -18,7 +18,7 @@ response = client.benchmarks.register(

 # Run evaluation
 job = client.eval.run_eval(
-    task_id="my_eval",
+    benchmark_id="my_eval",
     task_config={
         "type": "app",
         "eval_candidate": {"type": "agent", "config": agent_config},
@@ -26,5 +26,5 @@ job = client.eval.run_eval(
 )

 # Get results
-result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)
+result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
 ```

@@ -84,7 +84,7 @@ client.benchmarks.register(
 )

 response = client.eval.evaluate_rows(
-    task_id="meta-reference::mmmu",
+    benchmark_id="meta-reference::mmmu",
     input_rows=eval_rows,
     scoring_functions=["basic::regex_parser_multiple_choice_answer"],
     task_config={
@@ -142,7 +142,7 @@ client.benchmarks.register(
 )

 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={
@@ -192,7 +192,7 @@ agent_config = {
 }

 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={

@@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job

 Methods:

-- <code title="post /v1/eval/tasks/{task_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
-- <code title="post /v1/eval/tasks/{task_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>
+- <code title="post /v1/eval/tasks/{benchmark_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(benchmark_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
+- <code title="post /v1/eval/tasks/{benchmark_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(benchmark_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>

 ### Jobs

@@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse

 Methods:

-- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, task_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
-- <code title="delete /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, task_id) -> None</code>
-- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, task_id) -> Optional[JobStatusResponse]</code>
+- <code title="get /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, benchmark_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
+- <code title="delete /v1/eval/tasks/{benchmark_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, benchmark_id) -> None</code>
+- <code title="get /v1/eval/tasks/{benchmark_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, benchmark_id) -> Optional[JobStatusResponse]</code>

 ## Inspect

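The jobs sub-resource takes the renamed keyword as well. A hedged usage sketch of the three calls listed above, assuming the `client` and the `job` returned by the run_eval example near the top of this page ("my_eval" is the benchmark registered there):

```python
# Assumes `client` and `job` from the earlier run_eval sketch; return types follow
# the method list above.
status = client.eval.jobs.status(job.job_id, benchmark_id="my_eval")    # Optional[JobStatusResponse]
result = client.eval.jobs.retrieve(job.job_id, benchmark_id="my_eval")  # EvaluateResponse
client.eval.jobs.cancel(job.job_id, benchmark_id="my_eval")             # returns None
```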
@@ -76,27 +76,27 @@ class EvaluateResponse(BaseModel):


 class Eval(Protocol):
-    @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
+    @webmethod(route="/eval/tasks/{benchmark_id}/jobs", method="POST")
     async def run_eval(
         self,
-        task_id: str,
+        benchmark_id: str,
         task_config: BenchmarkConfig,
     ) -> Job: ...

-    @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
+    @webmethod(route="/eval/tasks/{benchmark_id}/evaluations", method="POST")
     async def evaluate_rows(
         self,
-        task_id: str,
+        benchmark_id: str,
         input_rows: List[Dict[str, Any]],
         scoring_functions: List[str],
         task_config: BenchmarkConfig,
     ) -> EvaluateResponse: ...

-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
-    async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
+    @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="GET")
+    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ...

-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
-    async def job_cancel(self, task_id: str, job_id: str) -> None: ...
+    @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="DELETE")
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ...

-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
-    async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ...
+    @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}/result", method="GET")
+    async def job_result(self, job_id: str, benchmark_id: str) -> EvaluateResponse: ...
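Because EvalRouter (further down in this diff) forwards the renamed argument to the selected provider, by keyword for run_eval and evaluate_rows, any provider implementing this protocol has to adopt the new parameter name too. A hedged stub only: the signatures follow the updated protocol above, while the class name and bodies are placeholders, not part of this commit:

```python
from typing import Any, Dict, List, Optional


class MyEvalProvider:
    # Placeholder provider showing the renamed `benchmark_id` parameter everywhere.
    async def run_eval(self, benchmark_id: str, task_config: Any) -> Any:
        raise NotImplementedError

    async def evaluate_rows(
        self,
        benchmark_id: str,
        input_rows: List[Dict[str, Any]],
        scoring_functions: List[str],
        task_config: Any,
    ) -> Any:
        raise NotImplementedError

    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[Any]:
        return None

    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
        raise NotImplementedError

    async def job_result(self, job_id: str, benchmark_id: str) -> Any:
        raise NotImplementedError
```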
@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .benchmarks import *  # noqa: F401 F403

@@ -1,86 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
-
-from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-
-
-class CommonBenchmarkFields(BaseModel):
-    dataset_id: str
-    scoring_functions: List[str]
-    metadata: Dict[str, Any] = Field(
-        default_factory=dict,
-        description="Metadata for this evaluation task",
-    )
-
-
-@json_schema_type
-class Benchmark(CommonBenchmarkFields, Resource):
-    type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value
-
-    @property
-    def task_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_benchmark_id(self) -> str:
-        return self.provider_resource_id
-
-
-class BenchmarkInput(CommonBenchmarkFields, BaseModel):
-    task_id: str
-    provider_id: Optional[str] = None
-    provider_benchmark_id: Optional[str] = None
-
-
-class ListBenchmarksResponse(BaseModel):
-    data: List[Benchmark]
-
-
-@runtime_checkable
-class Benchmarks(Protocol):
-    @webmethod(route="/eval/tasks", method="GET")
-    async def list_benchmarks(self) -> ListBenchmarksResponse: ...
-
-    @webmethod(route="/eval/tasks/{task_id}", method="GET")
-    async def get_benchmark(
-        self,
-        task_id: str,
-    ) -> Optional[Benchmark]: ...
-
-    @webmethod(route="/eval/tasks", method="POST")
-    async def register_benchmark(
-        self,
-        task_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...
-
-    @webmethod(route="/eval-tasks", method="GET")
-    async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse: ...
-
-    @webmethod(route="/eval-tasks/{benchmark_id}", method="GET")
-    async def DEPRECATED_get_benchmark(
-        self,
-        benchmark_id: str,
-    ) -> Optional[Benchmark]: ...
-
-    @webmethod(route="/eval-tasks", method="POST")
-    async def DEPRECATED_register_benchmark(
-        self,
-        benchmark_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...

@@ -105,7 +105,7 @@ class DownloadTask:
     output_file: str
     total_size: int = 0
     downloaded_size: int = 0
-    task_id: Optional[int] = None
+    benchmark_id: Optional[int] = None
     retries: int = 0
     max_retries: int = 3

@@ -183,8 +183,8 @@ class ParallelDownloader:
             )

             # Update the progress bar's total size once we know it
-            if task.task_id is not None:
-                self.progress.update(task.task_id, total=task.total_size)
+            if task.benchmark_id is not None:
+                self.progress.update(task.benchmark_id, total=task.total_size)

         except httpx.HTTPError as e:
             self.console.print(f"[red]Error getting file info: {str(e)}[/red]")
@@ -207,7 +207,7 @@ class ParallelDownloader:
                     file.write(chunk)
                     task.downloaded_size += len(chunk)
                     self.progress.update(
-                        task.task_id,
+                        task.benchmark_id,
                         completed=task.downloaded_size,
                     )

@@ -234,7 +234,7 @@ class ParallelDownloader:
         if os.path.exists(task.output_file):
             if self.verify_file_integrity(task):
                 self.console.print(f"[green]Already downloaded {task.output_file}[/green]")
-                self.progress.update(task.task_id, completed=task.total_size)
+                self.progress.update(task.benchmark_id, completed=task.total_size)
                 return

         await self.prepare_download(task)
@@ -258,7 +258,7 @@ class ParallelDownloader:
             raise DownloadError(f"Download failed: {str(e)}") from e

         except Exception as e:
-            self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]")
+            self.progress.update(task.benchmark_id, description=f"[red]Failed: {task.output_file}[/red]")
             raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e

     def has_disk_space(self, tasks: List[DownloadTask]) -> bool:
@@ -293,7 +293,7 @@ class ParallelDownloader:
         with self.progress:
             for task in tasks:
                 desc = f"Downloading {Path(task.output_file).name}"
-                task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)
+                task.benchmark_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)

             semaphore = asyncio.Semaphore(self.max_concurrent_downloads)

@@ -82,7 +82,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) -
     ) as progress:
         for filepath, expected_hash in checksums.items():
             full_path = model_dir / filepath
-            task_id = progress.add_task(f"Verifying {filepath}...", total=None)
+            benchmark_id = progress.add_task(f"Verifying {filepath}...", total=None)

             exists = full_path.exists()
             actual_hash = None
@@ -102,7 +102,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) -
                 )
             )

-            progress.remove_task(task_id)
+            progress.remove_task(benchmark_id)

     return results

@@ -347,23 +347,23 @@ class EvalRouter(Eval):

     async def run_eval(
         self,
-        task_id: str,
+        benchmark_id: str,
         task_config: AppBenchmarkConfig,
     ) -> Job:
-        return await self.routing_table.get_provider_impl(task_id).run_eval(
-            task_id=task_id,
+        return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
+            benchmark_id=benchmark_id,
             task_config=task_config,
         )

     async def evaluate_rows(
         self,
-        task_id: str,
+        benchmark_id: str,
         input_rows: List[Dict[str, Any]],
         scoring_functions: List[str],
         task_config: BenchmarkConfig,
     ) -> EvaluateResponse:
-        return await self.routing_table.get_provider_impl(task_id).evaluate_rows(
-            task_id=task_id,
+        return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
+            benchmark_id=benchmark_id,
             input_rows=input_rows,
             scoring_functions=scoring_functions,
             task_config=task_config,
@@ -371,28 +371,28 @@ class EvalRouter(Eval):

     async def job_status(
         self,
-        task_id: str,
+        benchmark_id: str,
         job_id: str,
     ) -> Optional[JobStatus]:
-        return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id)
+        return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)

     async def job_cancel(
         self,
-        task_id: str,
+        benchmark_id: str,
         job_id: str,
     ) -> None:
-        await self.routing_table.get_provider_impl(task_id).job_cancel(
-            task_id,
+        await self.routing_table.get_provider_impl(benchmark_id).job_cancel(
+            benchmark_id,
             job_id,
         )

     async def job_result(
         self,
-        task_id: str,
+        benchmark_id: str,
         job_id: str,
     ) -> EvaluateResponse:
-        return await self.routing_table.get_provider_impl(task_id).job_result(
-            task_id,
+        return await self.routing_table.get_provider_impl(benchmark_id).job_result(
+            benchmark_id,
             job_id,
         )

@@ -433,12 +433,12 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
     async def list_benchmarks(self) -> ListBenchmarksResponse:
         return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))

-    async def get_benchmark(self, task_id: str) -> Optional[Benchmark]:
-        return await self.get_object_by_identifier("benchmark", task_id)
+    async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]:
+        return await self.get_object_by_identifier("benchmark", benchmark_id)

     async def register_benchmark(
         self,
-        task_id: str,
+        benchmark_id: str,
         dataset_id: str,
         scoring_functions: List[str],
         metadata: Optional[Dict[str, Any]] = None,
@@ -455,9 +455,9 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
                 "No provider specified and multiple providers available. Please specify a provider_id."
             )
         if provider_benchmark_id is None:
-            provider_benchmark_id = task_id
+            provider_benchmark_id = benchmark_id
         benchmark = Benchmark(
-            identifier=task_id,
+            identifier=benchmark_id,
             dataset_id=dataset_id,
             scoring_functions=scoring_functions,
             metadata=metadata,

@@ -211,7 +211,7 @@ def run_evaluation_3():
             progress_bar.progress(progress, text=progress_text)
             # Run evaluation for current row
             eval_res = llama_stack_api.client.eval.evaluate_rows(
-                task_id=selected_benchmark,
+                benchmark_id=selected_benchmark,
                 input_rows=[r],
                 scoring_functions=benchmarks[selected_benchmark].scoring_functions,
                 task_config=benchmark_config,

@@ -83,10 +83,10 @@ class MetaReferenceEvalImpl(

     async def run_eval(
         self,
-        task_id: str,
+        benchmark_id: str,
         task_config: BenchmarkConfig,
     ) -> Job:
-        task_def = self.benchmarks[task_id]
+        task_def = self.benchmarks[benchmark_id]
         dataset_id = task_def.dataset_id
         candidate = task_config.eval_candidate
         scoring_functions = task_def.scoring_functions
@@ -97,7 +97,7 @@ class MetaReferenceEvalImpl(
             rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
         )
         res = await self.evaluate_rows(
-            task_id=task_id,
+            benchmark_id=benchmark_id,
             input_rows=all_rows.rows,
             scoring_functions=scoring_functions,
             task_config=task_config,
@@ -189,7 +189,7 @@ class MetaReferenceEvalImpl(

     async def evaluate_rows(
         self,
-        task_id: str,
+        benchmark_id: str,
         input_rows: List[Dict[str, Any]],
         scoring_functions: List[str],
         task_config: BenchmarkConfig,
@@ -219,17 +219,17 @@ class MetaReferenceEvalImpl(

         return EvaluateResponse(generations=generations, scores=score_response.results)

-    async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]:
+    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
         if job_id in self.jobs:
             return JobStatus.completed

         return None

-    async def job_cancel(self, task_id: str, job_id: str) -> None:
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
         raise NotImplementedError("Job cancel is not implemented yet")

-    async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse:
-        status = await self.job_status(task_id, job_id)
+    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
+        status = await self.job_status(benchmark_id, job_id)
         if not status or status != JobStatus.completed:
             raise ValueError(f"Job is not completed, Status: {status.value}")