feat(eval api): (2.1/n) fix resolver for benchmark routing table + fix precommit (#1691)
# What does this PR do?

- fixes the routing table so that `llama stack run` works
- fixes pre-commit
- one of a series of fixes to the eval API implementation

## Test Plan

```
llama stack run
```
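For context on the diffs below: benchmark registrations move from a `scoring_functions` field to a `grader_ids` field, in both the Python distribution template and the generated run.yaml. A minimal sketch of a registration after this change; the field names and values come straight from the diff, while the import path is an assumption and may differ at this revision:

```python
# Sketch only: field names are taken from the diffs in this commit;
# the import path below is an assumption and may differ at this revision.
from llama_stack.distribution.datatypes import BenchmarkInput  # assumed path

benchmark = BenchmarkInput(
    benchmark_id="meta-reference-simpleqa",      # how callers refer to the benchmark
    dataset_id="simpleqa",                       # dataset the benchmark evaluates over
    grader_ids=["llm-as-judge::405b-simpleqa"],  # renamed from scoring_functions
)
```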
Parent: bf135f38b1
Commit: 08c0c5505e
4 changed files with 31 additions and 26 deletions
```diff
@@ -214,27 +214,27 @@ def get_distribution_template() -> DistributionTemplate:
         BenchmarkInput(
             benchmark_id="meta-reference-simpleqa",
             dataset_id="simpleqa",
-            scoring_functions=["llm-as-judge::405b-simpleqa"],
+            grader_ids=["llm-as-judge::405b-simpleqa"],
         ),
         BenchmarkInput(
             benchmark_id="meta-reference-mmlu-cot",
             dataset_id="mmlu_cot",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+            grader_ids=["basic::regex_parser_multiple_choice_answer"],
         ),
         BenchmarkInput(
             benchmark_id="meta-reference-gpqa-cot",
             dataset_id="gpqa_cot",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+            grader_ids=["basic::regex_parser_multiple_choice_answer"],
         ),
         BenchmarkInput(
             benchmark_id="meta-reference-math-500",
             dataset_id="math_500",
-            scoring_functions=["basic::regex_parser_math_response"],
+            grader_ids=["basic::regex_parser_math_response"],
         ),
         BenchmarkInput(
             benchmark_id="meta-reference-bfcl",
             dataset_id="bfcl",
-            scoring_functions=["basic::bfcl"],
+            grader_ids=["basic::bfcl"],
         ),
     ]
     return DistributionTemplate(
```
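The "benchmark routing table" in the PR title is, at its core, a lookup from `benchmark_id` to a registered benchmark and the provider that serves it. The following is a deliberately simplified, hypothetical illustration of that lookup, not llama-stack's actual implementation, using the renamed field:

```python
# Illustrative only: a toy routing table keyed by benchmark_id. The real
# llama-stack routing table is more involved; this just shows the
# id -> registration lookup that the resolver fix restores.
from dataclasses import dataclass, field

@dataclass
class BenchmarkRegistration:
    benchmark_id: str
    dataset_id: str
    grader_ids: list[str] = field(default_factory=list)

routing_table: dict[str, BenchmarkRegistration] = {}

def register_benchmark(reg: BenchmarkRegistration) -> None:
    routing_table[reg.benchmark_id] = reg

def resolve_benchmark(benchmark_id: str) -> BenchmarkRegistration:
    try:
        return routing_table[benchmark_id]
    except KeyError:
        raise ValueError(f"unknown benchmark: {benchmark_id}") from None

register_benchmark(
    BenchmarkRegistration(
        benchmark_id="meta-reference-simpleqa",
        dataset_id="simpleqa",
        grader_ids=["llm-as-judge::405b-simpleqa"],
    )
)
assert resolve_benchmark("meta-reference-simpleqa").grader_ids == ["llm-as-judge::405b-simpleqa"]
```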
```diff
@@ -196,27 +196,27 @@ datasets:
 scoring_fns: []
 benchmarks:
 - dataset_id: simpleqa
-  scoring_functions:
+  grader_ids:
   - llm-as-judge::405b-simpleqa
   metadata: {}
   benchmark_id: meta-reference-simpleqa
 - dataset_id: mmlu_cot
-  scoring_functions:
+  grader_ids:
   - basic::regex_parser_multiple_choice_answer
   metadata: {}
   benchmark_id: meta-reference-mmlu-cot
 - dataset_id: gpqa_cot
-  scoring_functions:
+  grader_ids:
   - basic::regex_parser_multiple_choice_answer
   metadata: {}
   benchmark_id: meta-reference-gpqa-cot
 - dataset_id: math_500
-  scoring_functions:
+  grader_ids:
   - basic::regex_parser_math_response
   metadata: {}
   benchmark_id: meta-reference-math-500
 - dataset_id: bfcl
-  scoring_functions:
+  grader_ids:
   - basic::bfcl
   metadata: {}
   benchmark_id: meta-reference-bfcl
```
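One way to sanity-check a generated run.yaml after this change is to load it and assert that every benchmark entry uses the new key. A sketch assuming PyYAML is available; the file path is illustrative:

```python
# Sketch: verify benchmark entries in a generated run.yaml use grader_ids.
# Assumes PyYAML is installed; "run.yaml" is an illustrative path.
import yaml

with open("run.yaml") as f:
    config = yaml.safe_load(f)

for bench in config.get("benchmarks", []):
    bench_id = bench.get("benchmark_id", "<unknown>")
    assert "grader_ids" in bench, f"{bench_id} is missing grader_ids"
    assert "scoring_functions" not in bench, f"{bench_id} still uses the old key"
```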