diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py
index febd1691b..8d4b81792 100644
--- a/llama_stack/templates/open-benchmark/open_benchmark.py
+++ b/llama_stack/templates/open-benchmark/open_benchmark.py
@@ -215,8 +215,8 @@ def get_distribution_template() -> DistributionTemplate:
             purpose=DatasetPurpose.eval_messages_answer,
             source=URIDataSource(
                 uri="huggingface://datasets/llamastack/docvqa?split=val",
-            )
-        )
+            ),
+        ),
     ]
 
     default_benchmarks = [
@@ -254,7 +254,7 @@ def get_distribution_template() -> DistributionTemplate:
             benchmark_id="meta-reference-docvqa",
             dataset_id="docvqa",
             scoring_functions=["basic::docvqa"],
-        )
+        ),
     ]
     return DistributionTemplate(
         name=name,
diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml
index 8dbf51472..a7136c596 100644
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@@ -188,6 +188,18 @@ datasets:
     uri: huggingface://datasets/llamastack/bfcl_v3?split=train
   metadata: {}
   dataset_id: bfcl
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://datasets/llamastack/IfEval?split=train
+  metadata: {}
+  dataset_id: ifeval
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://datasets/llamastack/docvqa?split=val
+  metadata: {}
+  dataset_id: docvqa
 scoring_fns: []
 benchmarks:
 - dataset_id: simpleqa
@@ -215,6 +227,16 @@ benchmarks:
   - basic::bfcl
   metadata: {}
   benchmark_id: meta-reference-bfcl
+- dataset_id: ifeval
+  scoring_functions:
+  - basic::ifeval
+  metadata: {}
+  benchmark_id: meta-reference-ifeval
+- dataset_id: docvqa
+  scoring_functions:
+  - basic::docvqa
+  metadata: {}
+  benchmark_id: meta-reference-docvqa
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search
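
To sanity-check the two new registrations against a running stack, here is a minimal sketch using the llama-stack-client Python SDK. The base URL, the `identifier` field name, and the assertion setup are assumptions based on a default stack deployment, not part of this diff:

```python
# Minimal sketch: confirm the ifeval/docvqa datasets and benchmarks registered
# by this template are visible on a running stack. Assumes a server built from
# the open-benchmark template is listening on the default port (an assumption,
# not taken from this diff).
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# The datasets added to run.yaml above should appear alongside the existing ones.
dataset_ids = [d.identifier for d in client.datasets.list()]
assert "ifeval" in dataset_ids and "docvqa" in dataset_ids

# Likewise for the pre-registered benchmarks.
benchmark_ids = [b.identifier for b in client.benchmarks.list()]
assert "meta-reference-ifeval" in benchmark_ids
assert "meta-reference-docvqa" in benchmark_ids
```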