diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py
index d1c27e901..918eab225 100644
--- a/llama_stack/templates/open-benchmark/open_benchmark.py
+++ b/llama_stack/templates/open-benchmark/open_benchmark.py
@@ -189,6 +189,20 @@ def get_distribution_template() -> DistributionTemplate:
                 uri="huggingface://datasets/llamastack/gpqa_0shot_cot?split=test&name=gpqa_main",
             ),
         ),
+        DatasetInput(
+            dataset_id="gpqa_cot_diamond",
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://datasets/llamastack/gpqa_0shot_cot?split=test&name=gpqa_diamond",
+            ),
+        ),
+        DatasetInput(
+            dataset_id="gpqa",
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://datasets/llamastack/gpqa_0shot?split=test&name=gpqa_main",
+            ),
+        ),
         DatasetInput(
             dataset_id="math_500",
             purpose=DatasetPurpose.eval_messages_answer,
@@ -210,6 +224,34 @@ def get_distribution_template() -> DistributionTemplate:
                 uri="huggingface://datasets/llamastack/docvqa?split=val",
             ),
         ),
+        DatasetInput(
+            dataset_id="MMMU",
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://datasets/llamastack/mmmu_v3?split=validation",
+            ),
+        ),
+        DatasetInput(
+            dataset_id="MMMU_Pro_standard",
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://datasets/llamastack/MMMU_Pro?name=standard%20(10%20options)&split=test",
+            ),
+        ),
+        DatasetInput(
+            dataset_id="MMMU_Pro_vision",
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://datasets/llamastack/MMMU_Pro?name=vision&split=test",
+            ),
+        ),
+        DatasetInput(
+            dataset_id="ai2d",
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://datasets/llamastack/ai2d?split=test",
+            ),
+        ),
     ]

     default_benchmarks = [
@@ -243,6 +285,16 @@ def get_distribution_template() -> DistributionTemplate:
             dataset_id="docvqa",
             scoring_functions=["basic::docvqa"],
         ),
+        BenchmarkInput(
+            benchmark_id="meta-reference-MMMU_Pro_standard",
+            dataset_id="MMMU_Pro_standard",
+            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+        ),
+        BenchmarkInput(
+            benchmark_id="meta-reference-MMMU_Pro_vision",
+            dataset_id="MMMU_Pro_vision",
+            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+        ),
     ]
     return DistributionTemplate(
         name=name,
diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml
index 80a517fe8..47100070f 100644
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@@ -176,6 +176,18 @@ datasets:
     uri: huggingface://datasets/llamastack/gpqa_0shot_cot?split=test&name=gpqa_main
   metadata: {}
   dataset_id: gpqa_cot
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://datasets/llamastack/gpqa_0shot_cot?split=test&name=gpqa_diamond
+  metadata: {}
+  dataset_id: gpqa_cot_diamond
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://datasets/llamastack/gpqa_0shot?split=test&name=gpqa_main
+  metadata: {}
+  dataset_id: gpqa
 - purpose: eval/messages-answer
   source:
     type: uri
@@ -194,6 +206,30 @@ datasets:
     uri: huggingface://datasets/llamastack/docvqa?split=val
   metadata: {}
   dataset_id: docvqa
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://datasets/llamastack/mmmu_v3?split=validation
+  metadata: {}
+  dataset_id: MMMU
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://datasets/llamastack/MMMU_Pro?name=standard%20(10%20options)&split=test
+  metadata: {}
+  dataset_id: MMMU_Pro_standard
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://datasets/llamastack/MMMU_Pro?name=vision&split=test
+  metadata: {}
+  dataset_id: MMMU_Pro_vision
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://datasets/llamastack/ai2d?split=test
+  metadata: {}
+  dataset_id: ai2d
 scoring_fns: []
 benchmarks:
 - dataset_id: simpleqa
@@ -226,6 +262,16 @@ benchmarks:
   - basic::docvqa
   metadata: {}
   benchmark_id: meta-reference-docvqa
+- dataset_id: MMMU_Pro_standard
+  scoring_functions:
+  - basic::regex_parser_multiple_choice_answer
+  metadata: {}
+  benchmark_id: meta-reference-MMMU_Pro_standard
+- dataset_id: MMMU_Pro_vision
+  scoring_functions:
+  - basic::regex_parser_multiple_choice_answer
+  metadata: {}
+  benchmark_id: meta-reference-MMMU_Pro_vision
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search