feat: [new open benchmark] BFCL_v3 (#1578)

# What does this PR do?

Create a new dataset, BFCL_v3, from https://gorilla.cs.berkeley.edu/blogs/13_bfcl_v3_multi_turn.html.

Overall, each question asks the model to perform a task described in natural language; a set of available functions and their schemas is also given for the model to choose from. The model is required to write the function call, including the function name and parameters, that achieves the stated purpose. The results are validated against the provided ground truth by checking their ASTs, to make sure that the generated function call and the ground-truth function call are syntactically and semantically equivalent.

## Test Plan

Start the server:

```
llama stack run ./llama_stack/templates/ollama/run.yaml
```

Then send traffic:

```
llama-stack-client eval run-benchmark "bfcl" --model-id meta-llama/Llama-3.2-3B-Instruct --output-dir /tmp/gpqa --num-examples 2
```

[//]: # (## Documentation)
2025-03-14 12:50:49 -07:00 · 2025-03-14 12:50:49 -07:00 · a626b7bce3
commit a626b7bce3
parent 78d4872c0c
15 changed files with 1546 additions and 9 deletions
--- a/llama_stack/templates/open-benchmark/open_benchmark.py
+++ b/llama_stack/templates/open-benchmark/open_benchmark.py
@@ -226,6 +226,22 @@ def get_distribution_template() -> DistributionTemplate:
                "chat_completion_input": {"type": "string"},
            },
        ),
+        DatasetInput(
+            dataset_id="bfcl",
+            provider_id="huggingface",
+            url=URL(uri="https://huggingface.co/datasets/llamastack/bfcl_v3"),
+            metadata={
+                "path": "llamastack/bfcl_v3",
+                "split": "train",
+            },
+            dataset_schema={
+                "function": {"type": "string"},
+                "language": {"type": "string"},
+                "ground_truth": {"type": "string"},
+                "id": {"type": "string"},
+                "chat_completion_input": {"type": "string"},
+            },
+        ),
    ]

    default_benchmarks = [
@@ -249,6 +265,11 @@ def get_distribution_template() -> DistributionTemplate:
            dataset_id="math_500",
            scoring_functions=["basic::regex_parser_math_response"],
        ),
+        BenchmarkInput(
+            benchmark_id="meta-reference-bfcl",
+            dataset_id="bfcl",
+            scoring_functions=["basic::bfcl"],
+        ),
    ]
    return DistributionTemplate(
        name=name,
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@@ -216,6 +216,24 @@ datasets:
    split: test
  dataset_id: math_500
  provider_id: huggingface
+- dataset_schema:
+    function:
+      type: string
+    language:
+      type: string
+    ground_truth:
+      type: string
+    id:
+      type: string
+    chat_completion_input:
+      type: string
+  url:
+    uri: https://huggingface.co/datasets/llamastack/bfcl_v3
+  metadata:
+    path: llamastack/bfcl_v3
+    split: train
+  dataset_id: bfcl
+  provider_id: huggingface
 scoring_fns: []
 benchmarks:
 - dataset_id: simpleqa
@@ -238,6 +256,11 @@ benchmarks:
  - basic::regex_parser_math_response
  metadata: {}
  benchmark_id: meta-reference-math-500
+- dataset_id: bfcl
+  scoring_functions:
+  - basic::bfcl
+  metadata: {}
+  benchmark_id: meta-reference-bfcl
 tool_groups:
 - toolgroup_id: builtin::websearch
  provider_id: tavily-search