From 2a5374dbe3b902bb4a7c519bb95ba648707381b7 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 19 Mar 2025 15:04:35 -0700 Subject: [PATCH] regen pre-commit hooks --- .../open-benchmark/open_benchmark.py | 20 +++++++++++++++++++ llama_stack/templates/open-benchmark/run.yaml | 20 +++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py index 918eab225..68d984f18 100644 --- a/llama_stack/templates/open-benchmark/open_benchmark.py +++ b/llama_stack/templates/open-benchmark/open_benchmark.py @@ -270,6 +270,16 @@ def get_distribution_template() -> DistributionTemplate: dataset_id="gpqa_cot", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ), + BenchmarkInput( + benchmark_id="meta-reference-gpqa-cot-diamond", + dataset_id="gpqa_cot_diamond", + scoring_functions=["basic::regex_parser_multiple_choice_answer"], + ), + BenchmarkInput( + benchmark_id="meta-reference-gpqa", + dataset_id="gpqa", + scoring_functions=["basic::regex_parser_multiple_choice_answer"], + ), BenchmarkInput( benchmark_id="meta-reference-math-500", dataset_id="math_500", @@ -285,6 +295,11 @@ def get_distribution_template() -> DistributionTemplate: dataset_id="docvqa", scoring_functions=["basic::docvqa"], ), + BenchmarkInput( + benchmark_id="meta-reference-MMMU", + dataset_id="MMMU", + scoring_functions=["basic::regex_parser_multiple_choice_answer"], + ), BenchmarkInput( benchmark_id="meta-reference-MMMU_Pro_standard", dataset_id="MMMU_Pro_standard", @@ -295,6 +310,11 @@ def get_distribution_template() -> DistributionTemplate: dataset_id="MMMU_Pro_vision", scoring_functions=["basic::regex_parser_multiple_choice_answer"], ), + BenchmarkInput( + benchmark_id="meta-reference-ai2d", + dataset_id="ai2d", + scoring_functions=["basic::regex_parser_multiple_choice_answer"], + ), ] return DistributionTemplate( name=name, diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 47100070f..a5c96bc79 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -247,6 +247,16 @@ benchmarks: - basic::regex_parser_multiple_choice_answer metadata: {} benchmark_id: meta-reference-gpqa-cot +- dataset_id: gpqa_cot_diamond + scoring_functions: + - basic::regex_parser_multiple_choice_answer + metadata: {} + benchmark_id: meta-reference-gpqa-cot-diamond +- dataset_id: gpqa + scoring_functions: + - basic::regex_parser_multiple_choice_answer + metadata: {} + benchmark_id: meta-reference-gpqa - dataset_id: math_500 scoring_functions: - basic::regex_parser_math_response @@ -262,6 +272,11 @@ benchmarks: - basic::docvqa metadata: {} benchmark_id: meta-reference-docvqa +- dataset_id: MMMU + scoring_functions: + - basic::regex_parser_multiple_choice_answer + metadata: {} + benchmark_id: meta-reference-MMMU - dataset_id: MMMU_Pro_standard scoring_functions: - basic::regex_parser_multiple_choice_answer @@ -272,6 +287,11 @@ benchmarks: - basic::regex_parser_multiple_choice_answer metadata: {} benchmark_id: meta-reference-MMMU_Pro_vision +- dataset_id: ai2d + scoring_functions: + - basic::regex_parser_multiple_choice_answer + metadata: {} + benchmark_id: meta-reference-ai2d tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search