From ba14552a32f3a6cb49714234e2cd8a3258168891 Mon Sep 17 00:00:00 2001
From: Daniele Martinoli <86618610+dmartinol@users.noreply.github.com>
Date: Tue, 25 Mar 2025 15:04:47 +0100
Subject: [PATCH] fix: Misleading code in Llama Stack Benchmark Evals notebook
 (#1774)

# What does this PR do?
Closes #1773

Registering a benchmark only records its metadata: the `evaluate_rows` API
takes `input_rows` and `scoring_functions` directly and does not fetch data
from the registered dataset. The notebook previously suggested otherwise, so
the registration now passes an empty `scoring_functions` list with comments
explaining why, the `evaluate_rows_alpha` calls are replaced with
`evaluate_rows`, and the unused `"type": "benchmark"` key is dropped from
`benchmark_config`.

Signed-off-by: Daniele Martinoli

---
 docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
index 2102eec0d..5de7f715e 100644
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@@ -963,16 +963,19 @@
     "\n",
     "client.benchmarks.register(\n",
     "    benchmark_id=\"meta-reference::mmmu\",\n",
+    "    # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the\n",
+    "    # `input_rows` argument and does not fetch data from the dataset.\n",
     "    dataset_id=f\"mmmu-{subset}-{split}\",\n",
-    "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
+    "    # Note: for the same reason as above, we can use any value as `scoring_functions`.\n",
+    "    scoring_functions=[],\n",
     ")\n",
     "\n",
-    "response = client.eval.evaluate_rows_alpha(\n",
+    "response = client.eval.evaluate_rows(\n",
     "    benchmark_id=\"meta-reference::mmmu\",\n",
     "    input_rows=eval_rows,\n",
+    "    # Note: Here we define the actual scoring functions.\n",
     "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
     "    benchmark_config={\n",
-    "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"model\",\n",
     "            \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@@ -1139,12 +1142,11 @@
     "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
     ")\n",
     "\n",
-    "response = client.eval.evaluate_rows_alpha(\n",
+    "response = client.eval.evaluate_rows(\n",
     "    benchmark_id=\"meta-reference::simpleqa\",\n",
     "    input_rows=eval_rows.data,\n",
     "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
     "    benchmark_config={\n",
-    "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"model\",\n",
     "            \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@@ -1288,12 +1290,11 @@
     "    \"enable_session_persistence\": False,\n",
     "}\n",
     "\n",
-    "response = client.eval.evaluate_rows_alpha(\n",
+    "response = client.eval.evaluate_rows(\n",
     "    benchmark_id=\"meta-reference::simpleqa\",\n",
     "    input_rows=eval_rows.data,\n",
     "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
     "    benchmark_config={\n",
-    "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"agent\",\n",
     "            \"config\": agent_config,\n",
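
For reference, here is a minimal sketch of the corrected MMMU cell as it reads after this patch. It assumes `client`, `subset`, `split`, and `eval_rows` are already defined by earlier notebook cells (they appear only as diff context above), and it elides the remainder of `benchmark_config`, which the hunk truncates:

```python
# Sketch of the post-patch notebook cell. `client` is the Llama Stack client
# created earlier in the notebook; `subset`, `split`, and `eval_rows` also
# come from earlier cells.

client.benchmarks.register(
    benchmark_id="meta-reference::mmmu",
    # Any value works as `dataset_id`: the `evaluate_rows` API takes
    # `input_rows` directly and does not fetch data from the dataset.
    dataset_id=f"mmmu-{subset}-{split}",
    # For the same reason, the registered `scoring_functions` can be empty.
    scoring_functions=[],
)

response = client.eval.evaluate_rows(
    benchmark_id="meta-reference::mmmu",
    input_rows=eval_rows,
    # The scoring functions that actually run are the ones passed here.
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    benchmark_config={
        # The removed "type": "benchmark" key is no longer present.
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
            # ... remaining candidate config as in the notebook ...
        },
    },
)
```

The two SimpleQA cells change the same way: `evaluate_rows_alpha` becomes `evaluate_rows` and the `"type": "benchmark"` key disappears, with the model candidate swapped for an agent candidate (`{"type": "agent", "config": agent_config}`) in the last cell.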