diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py
index 918eab225..68d984f18 100644
--- a/llama_stack/templates/open-benchmark/open_benchmark.py
+++ b/llama_stack/templates/open-benchmark/open_benchmark.py
@@ -270,6 +270,16 @@ def get_distribution_template() -> DistributionTemplate:
             dataset_id="gpqa_cot",
             scoring_functions=["basic::regex_parser_multiple_choice_answer"],
         ),
+        BenchmarkInput(
+            benchmark_id="meta-reference-gpqa-cot-diamond",
+            dataset_id="gpqa_cot_diamond",
+            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+        ),
+        BenchmarkInput(
+            benchmark_id="meta-reference-gpqa",
+            dataset_id="gpqa",
+            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+        ),
         BenchmarkInput(
             benchmark_id="meta-reference-math-500",
             dataset_id="math_500",
@@ -285,6 +295,11 @@ def get_distribution_template() -> DistributionTemplate:
             dataset_id="docvqa",
             scoring_functions=["basic::docvqa"],
         ),
+        BenchmarkInput(
+            benchmark_id="meta-reference-MMMU",
+            dataset_id="MMMU",
+            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+        ),
         BenchmarkInput(
             benchmark_id="meta-reference-MMMU_Pro_standard",
             dataset_id="MMMU_Pro_standard",
@@ -295,6 +310,11 @@ def get_distribution_template() -> DistributionTemplate:
             dataset_id="MMMU_Pro_vision",
             scoring_functions=["basic::regex_parser_multiple_choice_answer"],
         ),
+        BenchmarkInput(
+            benchmark_id="meta-reference-ai2d",
+            dataset_id="ai2d",
+            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+        ),
     ]
     return DistributionTemplate(
         name=name,
diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml
index 47100070f..a5c96bc79 100644
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@@ -247,6 +247,16 @@ benchmarks:
   - basic::regex_parser_multiple_choice_answer
   metadata: {}
   benchmark_id: meta-reference-gpqa-cot
+- dataset_id: gpqa_cot_diamond
+  scoring_functions:
+  - basic::regex_parser_multiple_choice_answer
+  metadata: {}
+  benchmark_id: meta-reference-gpqa-cot-diamond
+- dataset_id: gpqa
+  scoring_functions:
+  - basic::regex_parser_multiple_choice_answer
+  metadata: {}
+  benchmark_id: meta-reference-gpqa
 - dataset_id: math_500
   scoring_functions:
   - basic::regex_parser_math_response
@@ -262,6 +272,11 @@ benchmarks:
   - basic::docvqa
   metadata: {}
   benchmark_id: meta-reference-docvqa
+- dataset_id: MMMU
+  scoring_functions:
+  - basic::regex_parser_multiple_choice_answer
+  metadata: {}
+  benchmark_id: meta-reference-MMMU
 - dataset_id: MMMU_Pro_standard
   scoring_functions:
   - basic::regex_parser_multiple_choice_answer
@@ -272,6 +287,11 @@ benchmarks:
   - basic::regex_parser_multiple_choice_answer
   metadata: {}
   benchmark_id: meta-reference-MMMU_Pro_vision
+- dataset_id: ai2d
+  scoring_functions:
+  - basic::regex_parser_multiple_choice_answer
+  metadata: {}
+  benchmark_id: meta-reference-ai2d
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search
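As a sanity check (not part of the diff itself), here is a minimal sketch of exercising one of the newly registered benchmarks through the client SDK, assuming a stack server built from the open-benchmark template and the llama_stack_client eval API of this release; the base URL, model name, and sampling parameters below are illustrative, so verify them against the current docs.

# Minimal sketch, not part of this PR: run one of the benchmarks this diff
# registers. Assumes the llama_stack_client eval API of this release; the
# base URL, candidate model, and sampling params are illustrative only.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Kick off an eval job against the newly added GPQA diamond CoT benchmark.
job = client.eval.run_eval(
    benchmark_id="meta-reference-gpqa-cot-diamond",
    benchmark_config={
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.3-70B-Instruct",  # illustrative candidate
            "sampling_params": {"temperature": 0.0, "max_tokens": 4096},
        },
    },
)

# Poll the job until it completes; scores come from the
# basic::regex_parser_multiple_choice_answer scoring function.
status = client.eval.jobs.status(
    job_id=job.job_id,
    benchmark_id="meta-reference-gpqa-cot-diamond",
)
print(status)

The same call works for the other benchmark_ids added here (meta-reference-gpqa, meta-reference-MMMU, meta-reference-ai2d) by swapping the benchmark_id.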