forked from phoenix-oss/llama-stack-mirror
fix: Misleading code in Llama Stack Benchmark Evals notebook (#1774)
# What does this PR do? Closes #1773. Signed-off-by: Daniele Martinoli <dmartino@redhat.com>
This commit is contained in:
parent
441016bee8
commit
ba14552a32
1 changed file with 8 additions and 7 deletions
|
@@ -963,16 +963,19 @@
|
|||
"\n",
|
||||
"client.benchmarks.register(\n",
|
||||
" benchmark_id=\"meta-reference::mmmu\",\n",
|
||||
" # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the \n",
|
||||
" # `input_rows` argument and does not fetch data from the dataset.\n",
|
||||
" dataset_id=f\"mmmu-{subset}-{split}\",\n",
|
||||
" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
|
||||
" # Note: for the same reason as above, we can use any value as `scoring_functions`.\n",
|
||||
" scoring_functions=[],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"response = client.eval.evaluate_rows_alpha(\n",
|
||||
"response = client.eval.evaluate_rows(\n",
|
||||
" benchmark_id=\"meta-reference::mmmu\",\n",
|
||||
" input_rows=eval_rows,\n",
|
||||
" # Note: Here we define the actual scoring functions.\n",
|
||||
" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
|
||||
" benchmark_config={\n",
|
||||
" \"type\": \"benchmark\",\n",
|
||||
" \"eval_candidate\": {\n",
|
||||
" \"type\": \"model\",\n",
|
||||
" \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
|
||||
|
@@ -1139,12 +1142,11 @@
|
|||
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"response = client.eval.evaluate_rows_alpha(\n",
|
||||
"response = client.eval.evaluate_rows(\n",
|
||||
" benchmark_id=\"meta-reference::simpleqa\",\n",
|
||||
" input_rows=eval_rows.data,\n",
|
||||
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
|
||||
" benchmark_config={\n",
|
||||
" \"type\": \"benchmark\",\n",
|
||||
" \"eval_candidate\": {\n",
|
||||
" \"type\": \"model\",\n",
|
||||
" \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
|
||||
|
@@ -1288,12 +1290,11 @@
|
|||
" \"enable_session_persistence\": False,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"response = client.eval.evaluate_rows_alpha(\n",
|
||||
"response = client.eval.evaluate_rows(\n",
|
||||
" benchmark_id=\"meta-reference::simpleqa\",\n",
|
||||
" input_rows=eval_rows.data,\n",
|
||||
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
|
||||
" benchmark_config={\n",
|
||||
" \"type\": \"benchmark\",\n",
|
||||
" \"eval_candidate\": {\n",
|
||||
" \"type\": \"agent\",\n",
|
||||
" \"config\": agent_config,\n",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue