diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py
index 7616563bb..85b351262 100644
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@@ -12,7 +12,7 @@ from llama_stack.apis.agents import Agents, StepType
 from llama_stack.apis.benchmarks import Benchmark
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.inference import Inference, UserMessage, SystemMessage
+from llama_stack.apis.inference import Inference, SystemMessage, UserMessage
 from llama_stack.apis.scoring import Scoring
 from llama_stack.distribution.datatypes import Api
 from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
@@ -118,7 +118,7 @@ class MetaReferenceEvalImpl(
         for i, x in tqdm(enumerate(input_rows)):
             assert ColumnName.chat_completion_input.value in x, "Invalid input row"
             input_messages = json.loads(x[ColumnName.chat_completion_input.value])
-            input_messages = [UserMessage(**x) for x in input_messages if x['role'] == 'user']
+            input_messages = [UserMessage(**x) for x in input_messages if x["role"] == "user"]

             # NOTE: only single-turn agent generation is supported. Create a new session for each input row
             session_create_response = await self.agents_api.create_agent_session(agent_id, f"session-{i}")
@@ -168,11 +168,11 @@ class MetaReferenceEvalImpl(
                 generations.append({ColumnName.generated_answer.value: response.completion_message.content})
             elif ColumnName.chat_completion_input.value in x:
                 chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value])
-                input_messages = [UserMessage(**x) for x in chat_completion_input_json if x['role'] == 'user']
+                input_messages = [UserMessage(**x) for x in chat_completion_input_json if x["role"] == "user"]
                 messages = []
                 if candidate.system_message:
                     messages.append(candidate.system_message)
-                messages += [SystemMessage(**x) for x in chat_completion_input_json if x['role'] == 'system']
+                messages += [SystemMessage(**x) for x in chat_completion_input_json if x["role"] == "system"]
                 messages += input_messages
                 response = await self.inference_api.chat_completion(
                     model_id=candidate.model,
diff --git a/llama_stack/providers/inline/scoring/basic/scoring.py b/llama_stack/providers/inline/scoring/basic/scoring.py
index ffc0a95bb..599f5f98c 100644
--- a/llama_stack/providers/inline/scoring/basic/scoring.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring.py
@@ -22,14 +22,19 @@ from llama_stack.providers.utils.common.data_schema_validator import (
 )

 from .config import BasicScoringConfig
+from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn
 from .scoring_fn.equality_scoring_fn import EqualityScoringFn
 from .scoring_fn.regex_parser_math_response_scoring_fn import RegexParserMathResponseScoringFn
 from .scoring_fn.regex_parser_scoring_fn import RegexParserScoringFn
 from .scoring_fn.subset_of_scoring_fn import SubsetOfScoringFn
-from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn
-

-FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn, RegexParserMathResponseScoringFn, BFCLScoringFn]
+FIXED_FNS = [
+    EqualityScoringFn,
+    SubsetOfScoringFn,
+    RegexParserScoringFn,
+    RegexParserMathResponseScoringFn,
+    BFCLScoringFn,
+]


 class BasicScoringImpl(
diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py
index e855ff656..f37780f3e 100644
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py
@@ -4,18 +4,17 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import json
+import re
 from typing import Any, Dict, Optional

 from llama_stack.apis.scoring import ScoringResultRow
 from llama_stack.apis.scoring_functions import ScoringFnParams
 from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn

-from .fn_defs.bfcl import bfcl
 from ..utils.bfcl.ast_parser import decode_ast
 from ..utils.bfcl.checker import ast_checker, is_empty_output
-import json
-import re
-
+from .fn_defs.bfcl import bfcl


 def postprocess(x: Dict[str, Any], test_category: str) -> Dict[str, Any]:
@@ -62,11 +61,7 @@ def gen_relevance_acc(x: Dict[str, Any]) -> Dict[str, float]:
     # If `test_category` is "irrelevance", the model is expected to output no function call.
     # No function call means either the AST decoding fails (a error message is generated) or the decoded AST does not contain any function call (such as a empty list, `[]`).
     # If `test_category` is "relevance", the model is expected to output to a function call, and empty list doesn't count as a function call.
-    acc = (
-        not x["contain_func_call"]
-        if "irrelevance" in x["id"]
-        else x["contain_func_call"]
-    )
+    acc = not x["contain_func_call"] if "irrelevance" in x["id"] else x["contain_func_call"]
     return {"valid": float(acc)}


@@ -87,12 +82,12 @@
         scoring_fn_identifier: Optional[str] = "bfcl",
         scoring_params: Optional[ScoringFnParams] = None,
     ) -> ScoringResultRow:
-        test_category = re.sub(r'_[0-9_-]+$', '', input_row['id'])
+        test_category = re.sub(r"_[0-9_-]+$", "", input_row["id"])
         score_result = postprocess(input_row, test_category)
-        if (test_category in {'irrelevance', 'live_relevance', 'live_irrelevance'}):
-            score = gen_relevance_acc(score_result)['valid']
+        if test_category in {"irrelevance", "live_relevance", "live_irrelevance"}:
+            score = gen_relevance_acc(score_result)["valid"]
         else:
-            score = gen_valid(score_result)['valid']
+            score = gen_valid(score_result)["valid"]
         return {
             "score": float(score),
         }
diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py
index f9ae213d8..445cdfc77 100644
--- a/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py
@@ -1,4 +1,4 @@
-#ruff: noqa
+# ruff: noqa
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -71,11 +71,7 @@ def parse_java_function_call(source_code):
             return f"new {type_text}()"
         elif node.type == "set":
             # Handling sets specifically
-            items = [
-                traverse_node(n, True)
-                for n in node.children
-                if n.type not in [",", "set"]
-            ]
+            items = [traverse_node(n, True) for n in node.children if n.type not in [",", "set"]]
             return "{" + ", ".join(items) + "}"

         elif node.child_count > 0:
@@ -124,9 +120,7 @@
             arguments = extract_arguments(arguments_node)
             for key, value in arguments.items():
                 if isinstance(value, list):
-                    raise Exception(
-                        "Error: Multiple arguments with the same name are not supported."
-                    )
+                    raise Exception("Error: Multiple arguments with the same name are not supported.")
             return [{function_name: arguments}]

     else:
@@ -157,9 +151,7 @@ def parse_javascript_function_call(source_code):
             # Extract left (name) and right (value) parts of the assignment
             name = child.children[0].text.decode("utf-8")
             value = child.children[2].text.decode("utf-8")
-            if (value.startswith('"') and value.endswith('"')) or (
-                value.startswith("'") and value.endswith("'")
-            ):
+            if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
                 value = value[1:-1]  # Trim the quotation marks
             if name in args:
                 if not isinstance(args[name], list):
@@ -190,9 +182,7 @@
             parameters = extract_arguments(arguments_node)
             for key, value in parameters.items():
                 if isinstance(value, list):
-                    raise Exception(
-                        "Error: Multiple arguments with the same name are not supported."
-                    )
+                    raise Exception("Error: Multiple arguments with the same name are not supported.")
             result = [{function_name: parameters}]
             return result
@@ -209,9 +199,7 @@ def ast_parse(input_str, language="Python"):
                 extracted.append(resolve_ast_call(elem))
         return extracted
     elif language == "Java":
-        return parse_java_function_call(
-            input_str[1:-1]
-        )  # Remove the [ and ] from the string
+        return parse_java_function_call(input_str[1:-1])  # Remove the [ and ] from the string
     elif language == "JavaScript":
         return parse_javascript_function_call(input_str[1:-1])
     else:
@@ -254,17 +242,10 @@ def resolve_ast_by_type(value):
    elif isinstance(value, ast.List):
        output = [resolve_ast_by_type(v) for v in value.elts]
    elif isinstance(value, ast.Dict):
-        output = {
-            resolve_ast_by_type(k): resolve_ast_by_type(v)
-            for k, v in zip(value.keys, value.values)
-        }
-    elif isinstance(
-        value, ast.NameConstant
-    ):  # Added this condition to handle boolean values
+        output = {resolve_ast_by_type(k): resolve_ast_by_type(v) for k, v in zip(value.keys, value.values)}
+    elif isinstance(value, ast.NameConstant):  # Added this condition to handle boolean values
         output = value.value
-    elif isinstance(
-        value, ast.BinOp
-    ):  # Added this condition to handle function calls as arguments
+    elif isinstance(value, ast.BinOp):  # Added this condition to handle function calls as arguments
         output = eval(ast.unparse(value))
     elif isinstance(value, ast.Name):
         output = value.id
@@ -311,7 +292,5 @@ def decode_execute(result):
     execution_list = []
     for function_call in decode_output:
         for key, value in function_call.items():
-            execution_list.append(
-                f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})"
-            )
+            execution_list.append(f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})")
     return execution_list
diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py
index e01eed882..f6aab123c 100644
--- a/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py
@@ -1,4 +1,4 @@
-#ruff: noqa
+# ruff: noqa
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -220,9 +220,7 @@ def list_checker(param: str, model_output: list, possible_answer: list):
         standardize_possible_answer.append([])
         for j in range(len(possible_answer[i])):
             if type(possible_answer[i][j]) == str:
-                standardize_possible_answer[i].append(
-                    standardize_string(possible_answer[i][j])
-                )
+                standardize_possible_answer[i].append(standardize_string(possible_answer[i][j]))
             else:
                 standardize_possible_answer[i].append(possible_answer[i][j])

@@ -244,7 +242,6 @@ def dict_checker(param: str, model_output: dict, possible_answers: list):
     result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"}

     for i in range(len(possible_answers)):
-
         if possible_answers[i] == "":
             continue

@@ -272,9 +269,7 @@
             standardize_possible_answer = []
             for i in range(len(possible_answer[key])):
                 if type(possible_answer[key][i]) == str:
-                    standardize_possible_answer.append(
-                        standardize_string(possible_answer[key][i])
-                    )
+                    standardize_possible_answer.append(standardize_string(possible_answer[key][i]))
                 else:
                     standardize_possible_answer.append(possible_answer[key][i])

@@ -353,7 +348,6 @@
         "error_type": "simple_function_checker:unclear",
     }

-
     # Check if function name matches
     if func_name not in model_output:
         result["valid"] = False
@@ -403,9 +397,7 @@
             if expected_type_description in NESTED_CONVERSION_TYPE_LIST:
                 nested_type = param_details[param]["items"]["type"]
                 nested_type_converted = JAVA_TYPE_CONVERSION[nested_type]
-                value = java_type_converter(
-                    value, expected_type_description, nested_type
-                )
+                value = java_type_converter(value, expected_type_description, nested_type)
             else:
                 value = java_type_converter(value, expected_type_description)

@@ -426,9 +418,7 @@
             if expected_type_description in NESTED_CONVERSION_TYPE_LIST:
                 nested_type = param_details[param]["items"]["type"]
                 nested_type_converted = JS_TYPE_CONVERSION[nested_type]
-                value = js_type_converter(
-                    value, expected_type_description, nested_type
-                )
+                value = js_type_converter(value, expected_type_description, nested_type)
             else:
                 value = js_type_converter(value, expected_type_description)

@@ -445,11 +435,7 @@
             value = list(value)

         # Allow python auto conversion from int to float
-        if (
-            language == "Python"
-            and expected_type_description == "float"
-            and type(value) == int
-        ):
+        if language == "Python" and expected_type_description == "float" and type(value) == int:
             value = float(value)

         # Type checking
@@ -609,9 +595,7 @@
             )

         if not result["valid"]:
-            considered_indices = [
-                i for i in range(len(model_output)) if i not in matched_indices
-            ]
+            considered_indices = [i for i in range(len(model_output)) if i not in matched_indices]
             all_errors.insert(
                 0,
                 f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.",  # type: ignore[arg-type]
@@ -782,9 +766,7 @@

    else:
        # structural match
-        pattern_match_result = patten_matcher(
-            exec_output, expected_result, function_call, is_sanity_check
-        )
+        pattern_match_result = patten_matcher(exec_output, expected_result, function_call, is_sanity_check)
        if not pattern_match_result["valid"]:
            return pattern_match_result

@@ -794,7 +776,6 @@
 def executable_checker_parallel_no_order(
     decoded_result: list, expected_exec_result: list, expected_exec_result_type: list
 ):
-
     if len(decoded_result) != len(expected_exec_result):
         return {
             "valid": False,
@@ -828,18 +809,14 @@
                         "sub_error": result["error"],
                         "sub_error_type": result["error_type"],
                         "model_executed_output": (
-                            result["model_executed_output"]
-                            if "model_executed_output" in result
-                            else None
+                            result["model_executed_output"] if "model_executed_output" in result else None
                         ),
                     }
                 }
             )

         if not result["valid"]:
-            considered_indices = [
-                i for i in range(len(decoded_result)) if i not in matched_indices
-            ]
+            considered_indices = [i for i in range(len(decoded_result)) if i not in matched_indices]
             all_errors.insert(
                 0,
                 f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.",  # type: ignore[arg-type]
@@ -874,7 +851,6 @@

     try:
         if response.status_code == 200:
-
             eval_GT_json = json.loads(EVAL_GROUND_TRUTH[idx])
             try:
                 if isinstance(eval_GT_json, dict):
@@ -888,9 +864,7 @@
                     }
                     return {
                         "valid": False,
-                        "error": [
-                            f"Expected dictionary, but got {type(response.json())}"
-                        ],
+                        "error": [f"Expected dictionary, but got {type(response.json())}"],
                         "error_type": "executable_checker_rest:wrong_type",
                     }

@@ -905,9 +879,7 @@

                 else:
                     for i in range(len(eval_GT_json)):
-                        if set(eval_GT_json[i].keys()) != set(
-                            response.json()[i].keys()
-                        ):
+                        if set(eval_GT_json[i].keys()) != set(response.json()[i].keys()):
                             return {
                                 "valid": False,
                                 "error": [f"Key inconsistency"],
@@ -918,16 +890,12 @@
             else:
                 return {
                     "valid": False,
-                    "error": [
-                        f"Expected list, but got {type(response.json())}"
-                    ],
+                    "error": [f"Expected list, but got {type(response.json())}"],
                     "error_type": "executable_checker_rest:wrong_type",
                 }
         return {
             "valid": False,
-            "error": [
-                f"Expected dict or list, but got {type(response.json())}"
-            ],
+            "error": [f"Expected dict or list, but got {type(response.json())}"],
             "error_type": "executable_checker_rest:wrong_type",
         }
     except Exception as e:
@@ -941,9 +909,7 @@
         else:
             return {
                 "valid": False,
-                "error": [
-                    f"Execution result status code is not 200, got {response.status_code}"
-                ],
+                "error": [f"Execution result status code is not 200, got {response.status_code}"],
                 "error_type": "executable_checker_rest:wrong_status_code",
             }
     except Exception as e:
@@ -954,18 +920,12 @@
         }


-def ast_checker(
-    func_description, model_output, possible_answer, language, test_category, model_name
-):
+def ast_checker(func_description, model_output, possible_answer, language, test_category, model_name):
     if "parallel" in test_category:
-        return parallel_function_checker_no_order(
-            func_description, model_output, possible_answer, language, model_name
-        )
+        return parallel_function_checker_no_order(func_description, model_output, possible_answer, language, model_name)
     elif "multiple" in test_category:
-        return multiple_function_checker(
-            func_description, model_output, possible_answer, language, model_name
-        )
+        return multiple_function_checker(func_description, model_output, possible_answer, language, model_name)
     else:
         if len(model_output) != 1:
diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml
index 5ef25435b..6961f8022 100644
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@@ -216,6 +216,24 @@ datasets:
     split: test
   dataset_id: math_500
   provider_id: huggingface
+- dataset_schema:
+    function:
+      type: string
+    language:
+      type: string
+    ground_truth:
+      type: string
+    id:
+      type: string
+    chat_completion_input:
+      type: string
+  url:
+    uri: https://huggingface.co/datasets/llamastack/bfcl_v3
+  metadata:
+    path: llamastack/bfcl_v3
+    split: train
+  dataset_id: bfcl
+  provider_id: huggingface
 scoring_fns: []
 benchmarks:
 - dataset_id: simpleqa
@@ -238,6 +256,11 @@
   - basic::regex_parser_math_response
   metadata: {}
   benchmark_id: meta-reference-math-500
+- dataset_id: bfcl
+  scoring_functions:
+  - basic::bfcl
+  metadata: {}
+  benchmark_id: meta-reference-bfcl
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search