Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-27 18:50:41 +00:00)
fix: Misleading code in Llama Stack Benchmark Evals notebook (#1774)
# What does this PR do?

Uses `client.eval.evaluate_rows` instead of `evaluate_rows_alpha` in the Benchmark Evals notebook, and adds notes clarifying that the `dataset_id` and `scoring_functions` passed to `client.benchmarks.register` are placeholders, since `evaluate_rows` receives `input_rows` and the scoring functions directly.

Closes #1773

Signed-off-by: Daniele Martinoli <dmartino@redhat.com>
parent 441016bee8 · commit ba14552a32

1 changed file with 8 additions and 7 deletions
@@ -963,16 +963,19 @@
 "\n",
 "client.benchmarks.register(\n",
 " benchmark_id=\"meta-reference::mmmu\",\n",
+" # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the \n",
+" # `input_rows` argument and does not fetch data from the dataset.\n",
 " dataset_id=f\"mmmu-{subset}-{split}\",\n",
-" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
+" # Note: for the same reason as above, we can use any value as `scoring_functions`.\n",
+" scoring_functions=[],\n",
 ")\n",
 "\n",
-"response = client.eval.evaluate_rows_alpha(\n",
+"response = client.eval.evaluate_rows(\n",
 " benchmark_id=\"meta-reference::mmmu\",\n",
 " input_rows=eval_rows,\n",
+" # Note: Here we define the actual scoring functions.\n",
 " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
 " benchmark_config={\n",
-" \"type\": \"benchmark\",\n",
 " \"eval_candidate\": {\n",
 " \"type\": \"model\",\n",
 " \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
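For readability, the MMMU cell after this hunk reads roughly like the sketch below. It is assembled from the new side of the diff; `client`, `subset`, `split`, and `eval_rows` are defined in earlier notebook cells, and anything outside the hunk (e.g. the rest of `eval_candidate`) is elided rather than guessed.

```python
# Sketch of the post-change cell (not verbatim from the notebook).
client.benchmarks.register(
    benchmark_id="meta-reference::mmmu",
    # `dataset_id` and `scoring_functions` are placeholders here: `evaluate_rows` below
    # receives the rows and scoring functions directly and does not fetch from the dataset.
    dataset_id=f"mmmu-{subset}-{split}",
    scoring_functions=[],
)

response = client.eval.evaluate_rows(
    benchmark_id="meta-reference::mmmu",
    input_rows=eval_rows,
    # The actual scoring function is passed here.
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    benchmark_config={
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
            # ... remaining eval_candidate fields lie outside this hunk and are elided.
        },
    },
)
```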
@@ -1139,12 +1142,11 @@
 " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 ")\n",
 "\n",
-"response = client.eval.evaluate_rows_alpha(\n",
+"response = client.eval.evaluate_rows(\n",
 " benchmark_id=\"meta-reference::simpleqa\",\n",
 " input_rows=eval_rows.data,\n",
 " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 " benchmark_config={\n",
-" \"type\": \"benchmark\",\n",
 " \"eval_candidate\": {\n",
 " \"type\": \"model\",\n",
 " \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@@ -1288,12 +1290,11 @@
 " \"enable_session_persistence\": False,\n",
 "}\n",
 "\n",
-"response = client.eval.evaluate_rows_alpha(\n",
+"response = client.eval.evaluate_rows(\n",
 " benchmark_id=\"meta-reference::simpleqa\",\n",
 " input_rows=eval_rows.data,\n",
 " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 " benchmark_config={\n",
-" \"type\": \"benchmark\",\n",
 " \"eval_candidate\": {\n",
 " \"type\": \"agent\",\n",
 " \"config\": agent_config,\n",
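The agent-candidate variant in this last hunk ends up looking roughly like the sketch below, again assembled from the new side of the diff; `agent_config`, `eval_rows`, and `client` come from earlier cells, and fields outside the hunk are elided.

```python
# Sketch of the agent-based SimpleQA evaluation after the change.
response = client.eval.evaluate_rows(
    benchmark_id="meta-reference::simpleqa",
    input_rows=eval_rows.data,
    scoring_functions=["llm-as-judge::405b-simpleqa"],
    benchmark_config={
        "eval_candidate": {
            # The candidate is an agent built from `agent_config`, not a bare model.
            "type": "agent",
            "config": agent_config,
            # ... fields outside this hunk are elided.
        },
    },
)
```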