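fix: rename `scoring_functions` to `grader_ids` in the open-benchmark template and run.yaml; reformat routing_tables.py; strip stray whitespace from the EvaluationTask description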

2025-12-30 22:30:01 +00:00 · 2025-03-18 20:39:38 -07:00 · 2025-03-18 20:39:38 -07:00 · 9a660a934b
commit 9a660a934b
parent cb343aa25c
6 changed files with 27 additions and 55 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -8548,7 +8548,7 @@
                },
                "additionalProperties": false,
                "title": "EvaluationTask",
-                "description": "A task for evaluation. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders. - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders.  - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders."
+                "description": "A task for evaluation. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders. - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders. - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders."
            },
            "GradeRequest": {
                "type": "object",
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -5927,7 +5927,7 @@ components:
        - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when
        you have a curated dataset and have settled on the graders. - `dataset_id`
        and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids.
-        Use this when you have datasets and / or are iterating on your graders.  -
+        Use this when you have datasets and / or are iterating on your graders. -
        `data_source` and `grader_ids`: Run evaluation task against a data source
        (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are
        early in your evaluation cycle and experimenting much more with your data
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@ -52,7 +52,7 @@ class EvaluationTask(BaseModel):
    """
    A task for evaluation. To specify a task, one of the following must be provided:
    - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders.
-    - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders. 
+    - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders.
    - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders.

    :param benchmark_id: The benchmark ID to evaluate.
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@ -105,9 +105,7 @@ class CommonRoutingTableImpl(RoutingTable):
        self.dist_registry = dist_registry

    async def initialize(self) -> None:
-        async def add_objects(
-            objs: List[RoutableObjectWithProvider], provider_id: str, cls
-        ) -> None:
+        async def add_objects(objs: List[RoutableObjectWithProvider], provider_id: str, cls) -> None:
            for obj in objs:
                if cls is None:
                    obj.provider_id = provider_id
@ -142,9 +140,7 @@ class CommonRoutingTableImpl(RoutingTable):
        for p in self.impls_by_provider_id.values():
            await p.shutdown()

-    def get_provider_impl(
-        self, routing_key: str, provider_id: Optional[str] = None
-    ) -> Any:
+    def get_provider_impl(self, routing_key: str, provider_id: Optional[str] = None) -> Any:
        def apiname_object():
            if isinstance(self, ModelsRoutingTable):
                return ("Inference", "model")
@ -182,9 +178,7 @@ class CommonRoutingTableImpl(RoutingTable):

        raise ValueError(f"Provider not found for `{routing_key}`")

-    async def get_object_by_identifier(
-        self, type: str, identifier: str
-    ) -> Optional[RoutableObjectWithProvider]:
+    async def get_object_by_identifier(self, type: str, identifier: str) -> Optional[RoutableObjectWithProvider]:
        # Get from disk registry
        obj = await self.dist_registry.get(type, identifier)
        if not obj:
@ -194,13 +188,9 @@ class CommonRoutingTableImpl(RoutingTable):

    async def unregister_object(self, obj: RoutableObjectWithProvider) -> None:
        await self.dist_registry.delete(obj.type, obj.identifier)
-        await unregister_object_from_provider(
-            obj, self.impls_by_provider_id[obj.provider_id]
-        )
+        await unregister_object_from_provider(obj, self.impls_by_provider_id[obj.provider_id])

-    async def register_object(
-        self, obj: RoutableObjectWithProvider
-    ) -> RoutableObjectWithProvider:
+    async def register_object(self, obj: RoutableObjectWithProvider) -> RoutableObjectWithProvider:
        # if provider_id is not specified, pick an arbitrary one from existing entries
        if not obj.provider_id and len(self.impls_by_provider_id) > 0:
            obj.provider_id = list(self.impls_by_provider_id.keys())[0]
@ -258,9 +248,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
        if model_type is None:
            model_type = ModelType.llm
        if "embedding_dimension" not in metadata and model_type == ModelType.embedding:
-            raise ValueError(
-                "Embedding model must have an embedding dimension in its metadata"
-            )
+            raise ValueError("Embedding model must have an embedding dimension in its metadata")
        model = Model(
            identifier=model_id,
            provider_resource_id=provider_model_id,
@ -280,9 +268,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):

 class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
    async def list_shields(self) -> ListShieldsResponse:
-        return ListShieldsResponse(
-            data=await self.get_all_with_type(ResourceType.shield.value)
-        )
+        return ListShieldsResponse(data=await self.get_all_with_type(ResourceType.shield.value))

    async def get_shield(self, identifier: str) -> Shield:
        shield = await self.get_object_by_identifier("shield", identifier)
@ -347,18 +333,14 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
                        f"No provider specified and multiple providers available. Arbitrarily selected the first provider {provider_id}."
                    )
            else:
-                raise ValueError(
-                    "No provider available. Please configure a vector_io provider."
-                )
+                raise ValueError("No provider available. Please configure a vector_io provider.")
        model = await self.get_object_by_identifier("model", embedding_model)
        if model is None:
            raise ValueError(f"Model {embedding_model} not found")
        if model.model_type != ModelType.embedding:
            raise ValueError(f"Model {embedding_model} is not an embedding model")
        if "embedding_dimension" not in model.metadata:
-            raise ValueError(
-                f"Model {embedding_model} does not have an embedding dimension"
-            )
+            raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
        vector_db_data = {
            "identifier": vector_db_id,
            "type": ResourceType.vector_db.value,
@ -380,9 +362,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):

 class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
    async def list_datasets(self) -> ListDatasetsResponse:
-        return ListDatasetsResponse(
-            data=await self.get_all_with_type(ResourceType.dataset.value)
-        )
+        return ListDatasetsResponse(data=await self.get_all_with_type(ResourceType.dataset.value))

    async def get_dataset(self, dataset_id: str) -> Dataset:
        dataset = await self.get_object_by_identifier("dataset", dataset_id)
@ -438,14 +418,10 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):

 class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
    async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
-        return ListScoringFunctionsResponse(
-            data=await self.get_all_with_type(ResourceType.scoring_function.value)
-        )
+        return ListScoringFunctionsResponse(data=await self.get_all_with_type(ResourceType.scoring_function.value))

    async def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn:
-        scoring_fn = await self.get_object_by_identifier(
-            "scoring_function", scoring_fn_id
-        )
+        scoring_fn = await self.get_object_by_identifier("scoring_function", scoring_fn_id)
        if scoring_fn is None:
            raise ValueError(f"Scoring function '{scoring_fn_id}' not found")
        return scoring_fn
@ -554,12 +530,8 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
        args: Optional[Dict[str, Any]] = None,
    ) -> None:
        tools = []
-        tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(
-            toolgroup_id, mcp_endpoint
-        )
-        tool_host = (
-            ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution
-        )
+        tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(toolgroup_id, mcp_endpoint)
+        tool_host = ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution

        for tool_def in tool_defs:
            tools.append(
--- a/llama_stack/templates/open-benchmark/open_benchmark.py
+++ b/llama_stack/templates/open-benchmark/open_benchmark.py
@ -214,27 +214,27 @@ def get_distribution_template() -> DistributionTemplate:
        BenchmarkInput(
            benchmark_id="meta-reference-simpleqa",
            dataset_id="simpleqa",
-            scoring_functions=["llm-as-judge::405b-simpleqa"],
+            grader_ids=["llm-as-judge::405b-simpleqa"],
        ),
        BenchmarkInput(
            benchmark_id="meta-reference-mmlu-cot",
            dataset_id="mmlu_cot",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+            grader_ids=["basic::regex_parser_multiple_choice_answer"],
        ),
        BenchmarkInput(
            benchmark_id="meta-reference-gpqa-cot",
            dataset_id="gpqa_cot",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+            grader_ids=["basic::regex_parser_multiple_choice_answer"],
        ),
        BenchmarkInput(
            benchmark_id="meta-reference-math-500",
            dataset_id="math_500",
-            scoring_functions=["basic::regex_parser_math_response"],
+            grader_ids=["basic::regex_parser_math_response"],
        ),
        BenchmarkInput(
            benchmark_id="meta-reference-bfcl",
            dataset_id="bfcl",
-            scoring_functions=["basic::bfcl"],
+            grader_ids=["basic::bfcl"],
        ),
    ]
    return DistributionTemplate(
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@ -196,27 +196,27 @@ datasets:
 scoring_fns: []
 benchmarks:
 - dataset_id: simpleqa
-  scoring_functions:
+  grader_ids:
  - llm-as-judge::405b-simpleqa
  metadata: {}
  benchmark_id: meta-reference-simpleqa
 - dataset_id: mmlu_cot
-  scoring_functions:
+  grader_ids:
  - basic::regex_parser_multiple_choice_answer
  metadata: {}
  benchmark_id: meta-reference-mmlu-cot
 - dataset_id: gpqa_cot
-  scoring_functions:
+  grader_ids:
  - basic::regex_parser_multiple_choice_answer
  metadata: {}
  benchmark_id: meta-reference-gpqa-cot
 - dataset_id: math_500
-  scoring_functions:
+  grader_ids:
  - basic::regex_parser_math_response
  metadata: {}
  benchmark_id: meta-reference-math-500
 - dataset_id: bfcl
-  scoring_functions:
+  grader_ids:
  - basic::bfcl
  metadata: {}
  benchmark_id: meta-reference-bfcl