diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
index 2102eec0d..5de7f715e 100644
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@@ -963,16 +963,19 @@
     "\n",
     "client.benchmarks.register(\n",
     "    benchmark_id=\"meta-reference::mmmu\",\n",
+    "    # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the\n",
+    "    # `input_rows` argument and does not fetch data from the dataset.\n",
     "    dataset_id=f\"mmmu-{subset}-{split}\",\n",
-    "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
+    "    # Note: for the same reason as above, we can use any value as `scoring_functions`.\n",
+    "    scoring_functions=[],\n",
     ")\n",
     "\n",
-    "response = client.eval.evaluate_rows_alpha(\n",
+    "response = client.eval.evaluate_rows(\n",
     "    benchmark_id=\"meta-reference::mmmu\",\n",
     "    input_rows=eval_rows,\n",
+    "    # Note: Here we define the actual scoring functions.\n",
     "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
     "    benchmark_config={\n",
-    "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"model\",\n",
     "            \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@@ -1139,12 +1142,11 @@
     "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
     ")\n",
     "\n",
-    "response = client.eval.evaluate_rows_alpha(\n",
+    "response = client.eval.evaluate_rows(\n",
     "    benchmark_id=\"meta-reference::simpleqa\",\n",
     "    input_rows=eval_rows.data,\n",
     "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
     "    benchmark_config={\n",
-    "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"model\",\n",
     "            \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@@ -1288,12 +1290,11 @@
     "    \"enable_session_persistence\": False,\n",
     "}\n",
     "\n",
-    "response = client.eval.evaluate_rows_alpha(\n",
+    "response = client.eval.evaluate_rows(\n",
     "    benchmark_id=\"meta-reference::simpleqa\",\n",
     "    input_rows=eval_rows.data,\n",
     "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
     "    benchmark_config={\n",
-    "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"agent\",\n",
     "            \"config\": agent_config,\n",
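
The net effect of this diff, shown as a single runnable sketch of the model-candidate path (not the notebook verbatim): `evaluate_rows_alpha` becomes `evaluate_rows`, the `"type": "benchmark"` discriminator is dropped from `benchmark_config`, and registration-time `dataset_id`/`scoring_functions` become placeholders. The server URL, the stand-in `eval_rows` schema, and the `sampling_params` shape below are assumptions; the notebook builds the real rows from the MMMU dataset in earlier cells.

```python
# Minimal sketch of the post-change call pattern, assuming a running
# Llama Stack server and the llama_stack_client package.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed server URL

# Hypothetical stand-in for the rows the notebook loads from the MMMU dataset.
eval_rows = [
    {
        "input_query": "Which option is correct?",
        "expected_answer": "A",
        "chat_completion_input": '[{"role": "user", "content": "Which option is correct?"}]',
    }
]

# dataset_id and scoring_functions are placeholders at registration time:
# evaluate_rows receives the rows and scoring functions directly, so the
# registered values are never read.
client.benchmarks.register(
    benchmark_id="meta-reference::mmmu",
    dataset_id="any-value-works-here",
    scoring_functions=[],
)

# evaluate_rows (formerly evaluate_rows_alpha); benchmark_config no longer
# carries the "type": "benchmark" discriminator.
response = client.eval.evaluate_rows(
    benchmark_id="meta-reference::mmmu",
    input_rows=eval_rows,
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    benchmark_config={
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
            # sampling_params shape assumed; the diff truncates after "model".
            "sampling_params": {"strategy": {"type": "greedy"}, "max_tokens": 512},
        },
    },
)
print(response)
```

The agent-candidate hunk follows the same pattern, swapping `{"type": "model", ...}` for `{"type": "agent", "config": agent_config}` inside `eval_candidate`.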