Remove bfcl from scoring functions

2025-12-18 16:49:47 +00:00 · 2025-08-29 10:37:44 -07:00 · 2025-08-29 10:37:44 -07:00 · a379b8d98c
commit a379b8d98c
parent 3130ca0a78
10 changed files with 3 additions and 1473 deletions
--- a/llama_stack/distributions/open-benchmark/open_benchmark.py
+++ b/llama_stack/distributions/open-benchmark/open_benchmark.py
@@ -53,7 +53,7 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
            "anthropic",
            [
                ProviderModelEntry(
-                    provider_model_id="anthropic/claude-3-5-sonnet-latest",
+                    provider_model_id="claude-3-5-sonnet-latest",
                    model_type=ModelType.llm,
                )
            ],
@@ -206,13 +206,6 @@ def get_distribution_template() -> DistributionTemplate:
                uri="huggingface://datasets/llamastack/math_500?split=test",
            ),
        ),
-        DatasetInput(
-            dataset_id="bfcl",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/bfcl_v3?split=train",
-            ),
-        ),
        DatasetInput(
            dataset_id="ifeval",
            purpose=DatasetPurpose.eval_messages_answer,
@@ -250,11 +243,6 @@ def get_distribution_template() -> DistributionTemplate:
            dataset_id="math_500",
            scoring_functions=["basic::regex_parser_math_response"],
        ),
-        BenchmarkInput(
-            benchmark_id="meta-reference-bfcl",
-            dataset_id="bfcl",
-            scoring_functions=["basic::bfcl"],
-        ),
        BenchmarkInput(
            benchmark_id="meta-reference-ifeval",
            dataset_id="ifeval",
--- a/llama_stack/distributions/open-benchmark/run.yaml
+++ b/llama_stack/distributions/open-benchmark/run.yaml
@@ -141,9 +141,9 @@ models:
  provider_model_id: openai/gpt-4o
  model_type: llm
 - metadata: {}
-  model_id: anthropic/claude-3-5-sonnet-latest
+  model_id: claude-3-5-sonnet-latest
  provider_id: anthropic
-  provider_model_id: anthropic/claude-3-5-sonnet-latest
+  provider_model_id: claude-3-5-sonnet-latest
  model_type: llm
 - metadata: {}
  model_id: gemini/gemini-1.5-flash
@@ -188,12 +188,6 @@ datasets:
    uri: huggingface://datasets/llamastack/math_500?split=test
  metadata: {}
  dataset_id: math_500
- purpose: eval/messages-answer
-  source:
-    type: uri
-    uri: huggingface://datasets/llamastack/bfcl_v3?split=train
-  metadata: {}
-  dataset_id: bfcl
 - purpose: eval/messages-answer
  source:
    type: uri
@@ -228,11 +222,6 @@ benchmarks:
  - basic::regex_parser_math_response
  metadata: {}
  benchmark_id: meta-reference-math-500
- dataset_id: bfcl
-  scoring_functions:
-  - basic::bfcl
-  metadata: {}
-  benchmark_id: meta-reference-bfcl
 - dataset_id: ifeval
  scoring_functions:
  - basic::ifeval