Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-12 13:00:39 +00:00)

commit 017d24fe65 (parent b20742fce7)

    replace task_id -> benchmark_id

16 changed files with 89 additions and 182 deletions
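The change is mechanical on the caller side: every eval entry point that previously took task_id now takes benchmark_id, and the /v1/eval/tasks/{...} routes are updated to match. A minimal sketch of the client-facing rename, assembled from the doc hunks below (eval_rows and task_config stand in for values built earlier in those docs, and the server address is an assumption):

    from llama_stack_client import LlamaStackClient

    client = LlamaStackClient(base_url="http://localhost:5000")  # assumed address

    # before: client.eval.evaluate_rows(task_id="meta-reference::mmmu", ...)
    # after:
    response = client.eval.evaluate_rows(
        benchmark_id="meta-reference::mmmu",
        input_rows=eval_rows,
        scoring_functions=["basic::regex_parser_multiple_choice_answer"],
        task_config=task_config,
    )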
docs/_static/llama-stack-spec.html (vendored), 26 lines changed

@@ -613,7 +613,7 @@
 }
 }
 },
-"/v1/eval/tasks/{task_id}/evaluations": {
+"/v1/eval/tasks/{benchmark_id}/evaluations": {
 "post": {
 "responses": {
 "200": {
@@ -633,7 +633,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -811,7 +811,7 @@
 ]
 }
 },
-"/v1/eval/tasks/{task_id}": {
+"/v1/eval/tasks/{benchmark_id}": {
 "get": {
 "responses": {
 "200": {
@@ -838,7 +838,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -1431,7 +1431,7 @@
 }
 }
 },
-"/v1/eval/tasks/{task_id}/jobs/{job_id}": {
+"/v1/eval/tasks/{benchmark_id}/jobs/{job_id}": {
 "get": {
 "responses": {
 "200": {
@@ -1458,7 +1458,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -1487,7 +1487,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -1505,7 +1505,7 @@
 ]
 }
 },
-"/v1/eval/tasks/{task_id}/jobs/{job_id}/result": {
+"/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result": {
 "get": {
 "responses": {
 "200": {
@@ -1533,7 +1533,7 @@
 }
 },
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -2204,7 +2204,7 @@
 ]
 }
 },
-"/v1/eval/tasks/{task_id}/jobs": {
+"/v1/eval/tasks/{benchmark_id}/jobs": {
 "post": {
 "responses": {
 "200": {
@@ -2224,7 +2224,7 @@
 "description": "",
 "parameters": [
 {
-"name": "task_id",
+"name": "benchmark_id",
 "in": "path",
 "required": true,
 "schema": {
@@ -7361,7 +7361,7 @@
 "RegisterBenchmarkRequest": {
 "type": "object",
 "properties": {
-"task_id": {
+"benchmark_id": {
 "type": "string"
 },
 "dataset_id": {
@@ -7407,7 +7407,7 @@
 },
 "additionalProperties": false,
 "required": [
-"task_id",
+"benchmark_id",
 "dataset_id",
 "scoring_functions"
 ]
docs/_static/llama-stack-spec.yaml (vendored), 26 lines changed

@@ -372,7 +372,7 @@ paths:
 schema:
 $ref: '#/components/schemas/EmbeddingsRequest'
 required: true
-/v1/eval/tasks/{task_id}/evaluations:
+/v1/eval/tasks/{benchmark_id}/evaluations:
 post:
 responses:
 '200':
@@ -385,7 +385,7 @@ paths:
 - Eval
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -490,7 +490,7 @@ paths:
 required: true
 schema:
 type: string
-/v1/eval/tasks/{task_id}:
+/v1/eval/tasks/{benchmark_id}:
 get:
 responses:
 '200':
@@ -505,7 +505,7 @@ paths:
 - Benchmarks
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -852,7 +852,7 @@ paths:
 schema:
 $ref: '#/components/schemas/InvokeToolRequest'
 required: true
-/v1/eval/tasks/{task_id}/jobs/{job_id}:
+/v1/eval/tasks/{benchmark_id}/jobs/{job_id}:
 get:
 responses:
 '200':
@@ -867,7 +867,7 @@ paths:
 - Eval
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -885,7 +885,7 @@ paths:
 - Eval
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -895,7 +895,7 @@ paths:
 required: true
 schema:
 type: string
-/v1/eval/tasks/{task_id}/jobs/{job_id}/result:
+/v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result:
 get:
 responses:
 '200':
@@ -913,7 +913,7 @@ paths:
 required: true
 schema:
 type: string
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -1328,7 +1328,7 @@ paths:
 type: array
 items:
 type: string
-/v1/eval/tasks/{task_id}/jobs:
+/v1/eval/tasks/{benchmark_id}/jobs:
 post:
 responses:
 '200':
@@ -1341,7 +1341,7 @@ paths:
 - Eval
 description: ''
 parameters:
-- name: task_id
+- name: benchmark_id
 in: path
 required: true
 schema:
@@ -4678,7 +4678,7 @@ components:
 RegisterBenchmarkRequest:
 type: object
 properties:
-task_id:
+benchmark_id:
 type: string
 dataset_id:
 type: string
@@ -4702,7 +4702,7 @@ components:
 - type: object
 additionalProperties: false
 required:
-- task_id
+- benchmark_id
 - dataset_id
 - scoring_functions
 RegisterModelRequest:
@@ -1024,7 +1024,7 @@
 ")\n",
 "\n",
 "response = client.eval.evaluate_rows(\n",
-"    task_id=\"meta-reference::mmmu\",\n",
+"    benchmark_id=\"meta-reference::mmmu\",\n",
 "    input_rows=eval_rows,\n",
 "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
 "    task_config={\n",
@@ -1203,7 +1203,7 @@
 ")\n",
 "\n",
 "response = client.eval.evaluate_rows(\n",
-"    task_id=\"meta-reference::simpleqa\",\n",
+"    benchmark_id=\"meta-reference::simpleqa\",\n",
 "    input_rows=eval_rows.rows,\n",
 "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 "    task_config={\n",
@@ -1352,7 +1352,7 @@
 "}\n",
 "\n",
 "response = client.eval.evaluate_rows(\n",
-"    task_id=\"meta-reference::simpleqa\",\n",
+"    benchmark_id=\"meta-reference::simpleqa\",\n",
 "    input_rows=eval_rows.rows,\n",
 "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 "    task_config={\n",
@@ -48,7 +48,7 @@ client.benchmarks.register(
 )
 
 response = client.eval.evaluate_rows(
-    task_id="meta-reference::mmmu",
+    benchmark_id="meta-reference::mmmu",
     input_rows=eval_rows,
     scoring_functions=["basic::regex_parser_multiple_choice_answer"],
     task_config={
@@ -106,7 +106,7 @@ client.benchmarks.register(
 )
 
 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={
@@ -156,7 +156,7 @@ agent_config = {
 }
 
 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
    input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={
@@ -18,7 +18,7 @@ response = client.benchmarks.register(
 
 # Run evaluation
 job = client.eval.run_eval(
-    task_id="my_eval",
+    benchmark_id="my_eval",
     task_config={
         "type": "app",
         "eval_candidate": {"type": "agent", "config": agent_config},
@@ -26,5 +26,5 @@ job = client.eval.run_eval(
 )
 
 # Get results
-result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)
+result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
 ```
@@ -84,7 +84,7 @@ client.benchmarks.register(
 )
 
 response = client.eval.evaluate_rows(
-    task_id="meta-reference::mmmu",
+    benchmark_id="meta-reference::mmmu",
     input_rows=eval_rows,
     scoring_functions=["basic::regex_parser_multiple_choice_answer"],
     task_config={
@@ -142,7 +142,7 @@ client.benchmarks.register(
 )
 
 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={
@@ -192,7 +192,7 @@ agent_config = {
 }
 
 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={
@@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job
 
 Methods:
 
-- <code title="post /v1/eval/tasks/{task_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
-- <code title="post /v1/eval/tasks/{task_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>
+- <code title="post /v1/eval/tasks/{benchmark_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(benchmark_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
+- <code title="post /v1/eval/tasks/{benchmark_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(benchmark_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>
 
 ### Jobs
 
@@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse
 
 Methods:
 
-- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, task_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
-- <code title="delete /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, task_id) -> None</code>
-- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, task_id) -> Optional[JobStatusResponse]</code>
+- <code title="get /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, benchmark_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
+- <code title="delete /v1/eval/tasks/{benchmark_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, benchmark_id) -> None</code>
+- <code title="get /v1/eval/tasks/{benchmark_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, benchmark_id) -> Optional[JobStatusResponse]</code>
 
 ## Inspect
 
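Taken together, the method listings above give the renamed jobs calls; a short sketch assuming job came from a prior run_eval (benchmark_id is keyword-only here, per the signatures above):

    # Poll, fetch, or cancel an eval job under the new parameter name.
    status = client.eval.jobs.status(job.job_id, benchmark_id="meta-reference::mmmu")
    result = client.eval.jobs.retrieve(job.job_id, benchmark_id="meta-reference::mmmu")
    client.eval.jobs.cancel(job.job_id, benchmark_id="meta-reference::mmmu")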
@@ -76,27 +76,27 @@ class EvaluateResponse(BaseModel):
 
 
 class Eval(Protocol):
-    @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
+    @webmethod(route="/eval/tasks/{benchmark_id}/jobs", method="POST")
     async def run_eval(
         self,
-        task_id: str,
+        benchmark_id: str,
         task_config: BenchmarkConfig,
     ) -> Job: ...
 
-    @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
+    @webmethod(route="/eval/tasks/{benchmark_id}/evaluations", method="POST")
     async def evaluate_rows(
         self,
-        task_id: str,
+        benchmark_id: str,
         input_rows: List[Dict[str, Any]],
         scoring_functions: List[str],
         task_config: BenchmarkConfig,
     ) -> EvaluateResponse: ...
 
-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
-    async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
+    @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="GET")
+    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ...
 
-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
-    async def job_cancel(self, task_id: str, job_id: str) -> None: ...
+    @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}", method="DELETE")
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ...
 
-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
-    async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ...
+    @webmethod(route="/eval/tasks/{benchmark_id}/jobs/{job_id}/result", method="GET")
+    async def job_result(self, job_id: str, benchmark_id: str) -> EvaluateResponse: ...
@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .benchmarks import *  # noqa: F401 F403
@@ -1,86 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
-
-from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-
-
-class CommonBenchmarkFields(BaseModel):
-    dataset_id: str
-    scoring_functions: List[str]
-    metadata: Dict[str, Any] = Field(
-        default_factory=dict,
-        description="Metadata for this evaluation task",
-    )
-
-
-@json_schema_type
-class Benchmark(CommonBenchmarkFields, Resource):
-    type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value
-
-    @property
-    def task_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_benchmark_id(self) -> str:
-        return self.provider_resource_id
-
-
-class BenchmarkInput(CommonBenchmarkFields, BaseModel):
-    task_id: str
-    provider_id: Optional[str] = None
-    provider_benchmark_id: Optional[str] = None
-
-
-class ListBenchmarksResponse(BaseModel):
-    data: List[Benchmark]
-
-
-@runtime_checkable
-class Benchmarks(Protocol):
-    @webmethod(route="/eval/tasks", method="GET")
-    async def list_benchmarks(self) -> ListBenchmarksResponse: ...
-
-    @webmethod(route="/eval/tasks/{task_id}", method="GET")
-    async def get_benchmark(
-        self,
-        task_id: str,
-    ) -> Optional[Benchmark]: ...
-
-    @webmethod(route="/eval/tasks", method="POST")
-    async def register_benchmark(
-        self,
-        task_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...
-
-    @webmethod(route="/eval-tasks", method="GET")
-    async def DEPRECATED_list_benchmarks(self) -> ListBenchmarksResponse: ...
-
-    @webmethod(route="/eval-tasks/{benchmark_id}", method="GET")
-    async def DEPRECATED_get_benchmark(
-        self,
-        benchmark_id: str,
-    ) -> Optional[Benchmark]: ...
-
-    @webmethod(route="/eval-tasks", method="POST")
-    async def DEPRECATED_register_benchmark(
-        self,
-        benchmark_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...
@@ -105,7 +105,7 @@ class DownloadTask:
     output_file: str
     total_size: int = 0
     downloaded_size: int = 0
-    task_id: Optional[int] = None
+    benchmark_id: Optional[int] = None
     retries: int = 0
     max_retries: int = 3
 
@@ -183,8 +183,8 @@ class ParallelDownloader:
            )
 
            # Update the progress bar's total size once we know it
-            if task.task_id is not None:
-                self.progress.update(task.task_id, total=task.total_size)
+            if task.benchmark_id is not None:
+                self.progress.update(task.benchmark_id, total=task.total_size)
 
        except httpx.HTTPError as e:
            self.console.print(f"[red]Error getting file info: {str(e)}[/red]")
@@ -207,7 +207,7 @@ class ParallelDownloader:
                    file.write(chunk)
                    task.downloaded_size += len(chunk)
                    self.progress.update(
-                        task.task_id,
+                        task.benchmark_id,
                        completed=task.downloaded_size,
                    )
 
@@ -234,7 +234,7 @@ class ParallelDownloader:
        if os.path.exists(task.output_file):
            if self.verify_file_integrity(task):
                self.console.print(f"[green]Already downloaded {task.output_file}[/green]")
-                self.progress.update(task.task_id, completed=task.total_size)
+                self.progress.update(task.benchmark_id, completed=task.total_size)
                return
 
        await self.prepare_download(task)
@@ -258,7 +258,7 @@ class ParallelDownloader:
            raise DownloadError(f"Download failed: {str(e)}") from e
 
        except Exception as e:
-            self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]")
+            self.progress.update(task.benchmark_id, description=f"[red]Failed: {task.output_file}[/red]")
            raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e
 
    def has_disk_space(self, tasks: List[DownloadTask]) -> bool:
@@ -293,7 +293,7 @@ class ParallelDownloader:
        with self.progress:
            for task in tasks:
                desc = f"Downloading {Path(task.output_file).name}"
-                task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)
+                task.benchmark_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)
 
            semaphore = asyncio.Semaphore(self.max_concurrent_downloads)
 
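In this file the renamed field is a rich progress handle, not an eval identifier: add_task returns a TaskID that update and remove_task accept. A minimal standalone sketch of that rich.progress pattern (the loop body is a stand-in for writing downloaded chunks):

    import time
    from rich.progress import Progress

    with Progress() as progress:
        handle = progress.add_task("Downloading...", total=100)
        for _ in range(10):
            time.sleep(0.05)                     # stand-in for writing a chunk
            progress.update(handle, advance=10)  # advance the bar by 10 units
        progress.remove_task(handle)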
@@ -82,7 +82,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) -
    ) as progress:
        for filepath, expected_hash in checksums.items():
            full_path = model_dir / filepath
-            task_id = progress.add_task(f"Verifying {filepath}...", total=None)
+            benchmark_id = progress.add_task(f"Verifying {filepath}...", total=None)
 
            exists = full_path.exists()
            actual_hash = None
@@ -102,7 +102,7 @@ def verify_files(model_dir: Path, checksums: Dict[str, str], console: Console) -
                )
            )
 
-            progress.remove_task(task_id)
+            progress.remove_task(benchmark_id)
 
    return results
 
@@ -347,23 +347,23 @@ class EvalRouter(Eval):
 
    async def run_eval(
        self,
-        task_id: str,
+        benchmark_id: str,
        task_config: AppBenchmarkConfig,
    ) -> Job:
-        return await self.routing_table.get_provider_impl(task_id).run_eval(
-            task_id=task_id,
+        return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
+            benchmark_id=benchmark_id,
            task_config=task_config,
        )
 
    async def evaluate_rows(
        self,
-        task_id: str,
+        benchmark_id: str,
        input_rows: List[Dict[str, Any]],
        scoring_functions: List[str],
        task_config: BenchmarkConfig,
    ) -> EvaluateResponse:
-        return await self.routing_table.get_provider_impl(task_id).evaluate_rows(
-            task_id=task_id,
+        return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
+            benchmark_id=benchmark_id,
            input_rows=input_rows,
            scoring_functions=scoring_functions,
            task_config=task_config,
@@ -371,28 +371,28 @@ class EvalRouter(Eval):
 
    async def job_status(
        self,
-        task_id: str,
+        benchmark_id: str,
        job_id: str,
    ) -> Optional[JobStatus]:
-        return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id)
+        return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)
 
    async def job_cancel(
        self,
-        task_id: str,
+        benchmark_id: str,
        job_id: str,
    ) -> None:
-        await self.routing_table.get_provider_impl(task_id).job_cancel(
-            task_id,
+        await self.routing_table.get_provider_impl(benchmark_id).job_cancel(
+            benchmark_id,
            job_id,
        )
 
    async def job_result(
        self,
-        task_id: str,
+        benchmark_id: str,
        job_id: str,
    ) -> EvaluateResponse:
-        return await self.routing_table.get_provider_impl(task_id).job_result(
-            task_id,
+        return await self.routing_table.get_provider_impl(benchmark_id).job_result(
+            benchmark_id,
            job_id,
        )
 
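The EvalRouter hunks above all follow one pattern: resolve the provider registered for a benchmark_id, then forward the call unchanged. A toy sketch of that dispatch shape (all names here are hypothetical, not the repo's actual routing-table API):

    class ToyRoutingTable:
        def __init__(self, impls: dict):
            self.impls = impls  # maps benchmark_id -> provider implementation

        def get_provider_impl(self, benchmark_id: str):
            return self.impls[benchmark_id]

    class ToyEvalRouter:
        def __init__(self, routing_table: ToyRoutingTable):
            self.routing_table = routing_table

        async def run_eval(self, benchmark_id: str, task_config):
            # Look up whichever provider owns this benchmark and forward.
            impl = self.routing_table.get_provider_impl(benchmark_id)
            return await impl.run_eval(benchmark_id=benchmark_id, task_config=task_config)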
@@ -433,12 +433,12 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
    async def list_benchmarks(self) -> ListBenchmarksResponse:
        return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))
 
-    async def get_benchmark(self, task_id: str) -> Optional[Benchmark]:
-        return await self.get_object_by_identifier("benchmark", task_id)
+    async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]:
+        return await self.get_object_by_identifier("benchmark", benchmark_id)
 
    async def register_benchmark(
        self,
-        task_id: str,
+        benchmark_id: str,
        dataset_id: str,
        scoring_functions: List[str],
        metadata: Optional[Dict[str, Any]] = None,
@@ -455,9 +455,9 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
                "No provider specified and multiple providers available. Please specify a provider_id."
            )
        if provider_benchmark_id is None:
-            provider_benchmark_id = task_id
+            provider_benchmark_id = benchmark_id
        benchmark = Benchmark(
-            identifier=task_id,
+            identifier=benchmark_id,
            dataset_id=dataset_id,
            scoring_functions=scoring_functions,
            metadata=metadata,
@@ -211,7 +211,7 @@ def run_evaluation_3():
        progress_bar.progress(progress, text=progress_text)
        # Run evaluation for current row
        eval_res = llama_stack_api.client.eval.evaluate_rows(
-            task_id=selected_benchmark,
+            benchmark_id=selected_benchmark,
            input_rows=[r],
            scoring_functions=benchmarks[selected_benchmark].scoring_functions,
            task_config=benchmark_config,
@@ -83,10 +83,10 @@ class MetaReferenceEvalImpl(
 
    async def run_eval(
        self,
-        task_id: str,
+        benchmark_id: str,
        task_config: BenchmarkConfig,
    ) -> Job:
-        task_def = self.benchmarks[task_id]
+        task_def = self.benchmarks[benchmark_id]
        dataset_id = task_def.dataset_id
        candidate = task_config.eval_candidate
        scoring_functions = task_def.scoring_functions
@@ -97,7 +97,7 @@ class MetaReferenceEvalImpl(
            rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
        )
        res = await self.evaluate_rows(
-            task_id=task_id,
+            benchmark_id=benchmark_id,
            input_rows=all_rows.rows,
            scoring_functions=scoring_functions,
            task_config=task_config,
@@ -189,7 +189,7 @@ class MetaReferenceEvalImpl(
 
    async def evaluate_rows(
        self,
-        task_id: str,
+        benchmark_id: str,
        input_rows: List[Dict[str, Any]],
        scoring_functions: List[str],
        task_config: BenchmarkConfig,
@@ -219,17 +219,17 @@ class MetaReferenceEvalImpl(
 
        return EvaluateResponse(generations=generations, scores=score_response.results)
 
-    async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]:
+    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
        if job_id in self.jobs:
            return JobStatus.completed
 
        return None
 
-    async def job_cancel(self, task_id: str, job_id: str) -> None:
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
        raise NotImplementedError("Job cancel is not implemented yet")
 
-    async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse:
-        status = await self.job_status(task_id, job_id)
+    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
+        status = await self.job_status(benchmark_id, job_id)
        if not status or status != JobStatus.completed:
            raise ValueError(f"Job is not completed, Status: {status.value}")
 
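Read as a whole, the hunks above give the renamed server-side lifecycle: run_eval kicks off a job, job_status reports completed for any known job_id, and job_result refuses to answer until that status is reached. A hedged usage sketch (impl and task_config stand in for a constructed MetaReferenceEvalImpl and its config):

    job = await impl.run_eval(benchmark_id="meta-reference::mmmu", task_config=task_config)
    status = await impl.job_status(benchmark_id="meta-reference::mmmu", job_id=job.job_id)
    if status == JobStatus.completed:
        result = await impl.job_result(benchmark_id="meta-reference::mmmu", job_id=job.job_id)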