fix: Misleading code in Llama Stack Benchmark Evals notebook (#1774)

# What does this PR do?
Closes #1773

Signed-off-by: Daniele Martinoli <dmartino@redhat.com>
Author: Daniele Martinoli
Committed by GitHub: 2025-03-25 15:04:47 +01:00
Commit: ba14552a32 (parent 441016bee8)

@@ -963,16 +963,19 @@
 "\n",
 "client.benchmarks.register(\n",
 "    benchmark_id=\"meta-reference::mmmu\",\n",
+"    # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the \n",
+"    # `input_rows` argument and does not fetch data from the dataset.\n",
 "    dataset_id=f\"mmmu-{subset}-{split}\",\n",
-"    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
+"    # Note: for the same reason as above, we can use any value as `scoring_functions`.\n",
+"    scoring_functions=[],\n",
 ")\n",
 "\n",
-"response = client.eval.evaluate_rows_alpha(\n",
+"response = client.eval.evaluate_rows(\n",
 "    benchmark_id=\"meta-reference::mmmu\",\n",
 "    input_rows=eval_rows,\n",
+"    # Note: Here we define the actual scoring functions.\n",
 "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
 "    benchmark_config={\n",
-"        \"type\": \"benchmark\",\n",
 "        \"eval_candidate\": {\n",
 "            \"type\": \"model\",\n",
 "            \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@@ -1139,12 +1142,11 @@
 "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 ")\n",
 "\n",
-"response = client.eval.evaluate_rows_alpha(\n",
+"response = client.eval.evaluate_rows(\n",
 "    benchmark_id=\"meta-reference::simpleqa\",\n",
 "    input_rows=eval_rows.data,\n",
 "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 "    benchmark_config={\n",
-"        \"type\": \"benchmark\",\n",
 "        \"eval_candidate\": {\n",
 "            \"type\": \"model\",\n",
 "            \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@@ -1288,12 +1290,11 @@
 "    \"enable_session_persistence\": False,\n",
 "}\n",
 "\n",
-"response = client.eval.evaluate_rows_alpha(\n",
+"response = client.eval.evaluate_rows(\n",
 "    benchmark_id=\"meta-reference::simpleqa\",\n",
 "    input_rows=eval_rows.data,\n",
 "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 "    benchmark_config={\n",
-"        \"type\": \"benchmark\",\n",
 "        \"eval_candidate\": {\n",
 "            \"type\": \"agent\",\n",
 "            \"config\": agent_config,\n",