Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-19 03:10:03 +00:00)
chore: rename task_config to benchmark_config (#1397)

# What does this PR do?

- This rename was missed in the previous deprecation: https://github.com/meta-llama/llama-stack/pull/1186
- Part of https://github.com/meta-llama/llama-stack/issues/1396

## Test Plan

```
pytest -v -s --nbval-lax ./llama-stack/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
```
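At the call sites touched below only the keyword name changes; the `BenchmarkConfig` payload itself is unchanged. A minimal sketch with the Python client (assuming `client` is an initialized `LlamaStackClient` and `eval_rows` has already been fetched; the model id and sampling strategy are illustrative placeholders, not values from this diff):

```python
benchmark_config = {
    "type": "benchmark",
    "eval_candidate": {
        "type": "model",
        "model": "meta-llama/Llama-3.2-3B-Instruct",  # illustrative
        "sampling_params": {"strategy": {"type": "greedy"}},  # illustrative
    },
}

# Before this PR (deprecated keyword):
#   client.eval.evaluate_rows(..., task_config=benchmark_config)

# After this PR:
response = client.eval.evaluate_rows(
    benchmark_id="meta-reference::mmmu",
    input_rows=eval_rows,
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    benchmark_config=benchmark_config,
)
```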
This commit is contained in:
parent 158b6dc404
commit e9a37bad63

12 changed files with 55 additions and 46 deletions
docs/_static/llama-stack-spec.html (vendored): 8 changed lines
@@ -6355,7 +6355,7 @@
               "type": "string"
             }
           },
-          "task_config": {
+          "benchmark_config": {
             "$ref": "#/components/schemas/BenchmarkConfig"
           }
         },
@@ -6363,7 +6363,7 @@
         "required": [
           "input_rows",
           "scoring_functions",
-          "task_config"
+          "benchmark_config"
         ],
         "title": "EvaluateRowsRequest"
       },
@@ -9248,13 +9248,13 @@
       "RunEvalRequest": {
         "type": "object",
         "properties": {
-          "task_config": {
+          "benchmark_config": {
             "$ref": "#/components/schemas/BenchmarkConfig"
           }
         },
         "additionalProperties": false,
         "required": [
-          "task_config"
+          "benchmark_config"
         ],
         "title": "RunEvalRequest"
       },
docs/_static/llama-stack-spec.yaml (vendored): 8 changed lines
@@ -4357,13 +4357,13 @@ components:
         type: array
         items:
           type: string
-      task_config:
+      benchmark_config:
         $ref: '#/components/schemas/BenchmarkConfig'
     additionalProperties: false
     required:
       - input_rows
       - scoring_functions
-      - task_config
+      - benchmark_config
     title: EvaluateRowsRequest
   EvaluateResponse:
     type: object
@@ -6168,11 +6168,11 @@ components:
   RunEvalRequest:
     type: object
     properties:
-      task_config:
+      benchmark_config:
         $ref: '#/components/schemas/BenchmarkConfig'
     additionalProperties: false
     required:
-      - task_config
+      - benchmark_config
     title: RunEvalRequest
   Job:
     type: object
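In both renamed schemas the property keeps its `$ref` to `BenchmarkConfig`; for `RunEvalRequest` it is now the only required field. A sketch of the resulting request body shape (the nested candidate fields are illustrative, mirroring the notebook snippets below rather than values from the spec excerpt above):

```python
# Hypothetical payload for a run-eval request after this change; only
# "benchmark_config" is required per the updated RunEvalRequest schema.
run_eval_body = {
    "benchmark_config": {
        "type": "benchmark",
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.2-3B-Instruct",  # illustrative
        },
    },
}
```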
@@ -3675,7 +3675,7 @@
     "    benchmark_id=\"llama3.2-3B-instruct:tax_eval\",\n",
     "    input_rows=eval_rows.rows,\n",
     "    scoring_functions=[\"braintrust::answer-similarity\"],\n",
-    "    task_config={\n",
+    "    benchmark_config={\n",
     "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"model\",\n",
@@ -6383,7 +6383,7 @@
     "    benchmark_id=\"Llama-3.2-3B-Instruct-sft-0:tax_eval\",\n",
     "    input_rows=eval_rows.rows,\n",
     "    scoring_functions=[\"braintrust::answer-similarity\"],\n",
-    "    task_config={\n",
+    "    benchmark_config={\n",
     "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"model\",\n",
@@ -781,7 +781,7 @@
     "    benchmark_id=\"meta-reference::mmmu\",\n",
     "    input_rows=eval_rows,\n",
     "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
-    "    task_config={\n",
+    "    benchmark_config={\n",
     "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"model\",\n",
@@ -960,7 +960,7 @@
     "    benchmark_id=\"meta-reference::simpleqa\",\n",
     "    input_rows=eval_rows.rows,\n",
     "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
-    "    task_config={\n",
+    "    benchmark_config={\n",
     "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"model\",\n",
@@ -1109,7 +1109,7 @@
     "    benchmark_id=\"meta-reference::simpleqa\",\n",
     "    input_rows=eval_rows.rows,\n",
     "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
-    "    task_config={\n",
+    "    benchmark_config={\n",
     "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"agent\",\n",
@@ -51,7 +51,7 @@ response = client.eval.evaluate_rows(
     benchmark_id="meta-reference::mmmu",
     input_rows=eval_rows,
     scoring_functions=["basic::regex_parser_multiple_choice_answer"],
-    task_config={
+    benchmark_config={
         "type": "benchmark",
         "eval_candidate": {
             "type": "model",
@@ -109,7 +109,7 @@ response = client.eval.evaluate_rows(
     benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
-    task_config={
+    benchmark_config={
         "type": "benchmark",
         "eval_candidate": {
             "type": "model",
@@ -158,7 +158,7 @@ response = client.eval.evaluate_rows(
     benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
-    task_config={
+    benchmark_config={
         "type": "benchmark",
         "eval_candidate": {
             "type": "agent",
@@ -19,7 +19,7 @@ response = client.benchmarks.register(
 # Run evaluation
 job = client.eval.run_eval(
     benchmark_id="my_eval",
-    task_config={
+    benchmark_config={
         "type": "app",
         "eval_candidate": {"type": "agent", "config": agent_config},
     },
@@ -87,7 +87,7 @@ response = client.eval.evaluate_rows(
     benchmark_id="meta-reference::mmmu",
     input_rows=eval_rows,
     scoring_functions=["basic::regex_parser_multiple_choice_answer"],
-    task_config={
+    benchmark_config={
         "type": "benchmark",
         "eval_candidate": {
             "type": "model",
@@ -145,7 +145,7 @@ response = client.eval.evaluate_rows(
     benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
-    task_config={
+    benchmark_config={
         "type": "benchmark",
         "eval_candidate": {
             "type": "model",
@@ -195,7 +195,7 @@ response = client.eval.evaluate_rows(
     benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
-    task_config={
+    benchmark_config={
         "type": "benchmark",
         "eval_candidate": {
             "type": "agent",