From 9a660a934bf418821fe49862c1f679ea08d865cb Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Tue, 18 Mar 2025 20:39:38 -0700
Subject: [PATCH] fix: rename benchmark scoring_functions to grader_ids and
 reformat routing tables

---
 docs/_static/llama-stack-spec.html            |  2 +-
 docs/_static/llama-stack-spec.yaml            |  2 +-
 llama_stack/apis/evaluation/evaluation.py     |  2 +-
 .../distribution/routers/routing_tables.py    | 56 +++++-------------
 .../open-benchmark/open_benchmark.py          | 10 ++--
 llama_stack/templates/open-benchmark/run.yaml | 10 ++--
 6 files changed, 27 insertions(+), 55 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 09d4cb805..0f223b51b 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -8548,7 +8548,7 @@
       },
       "additionalProperties": false,
       "title": "EvaluationTask",
-      "description": "A task for evaluation. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders. - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders. - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders."
+      "description": "A task for evaluation. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders. - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders. - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders."
     },
     "GradeRequest": {
       "type": "object",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 72361c50e..7c4ea81b8 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -5927,7 +5927,7 @@ components:
       - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when
       you have a curated dataset and have settled on the graders. - `dataset_id`
       and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids.
-      Use this when you have datasets and / or are iterating on your graders. -
+      Use this when you have datasets and / or are iterating on your graders. -
       `data_source` and `grader_ids`: Run evaluation task against a data source
      (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are
       early in your evaluation cycle and experimenting much more with your data
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
index e1f02dbae..269004b26 100644
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -52,7 +52,7 @@ class EvaluationTask(BaseModel):
     """
     A task for evaluation. To specify a task, one of the following must be provided:
     - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders.
-    - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders. 
+    - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders.
     - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders.
 
     :param benchmark_id: The benchmark ID to evaluate.
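Note: the docstring above describes three mutually exclusive ways to build an `EvaluationTask`. A minimal sketch, assuming only the fields named in the docstring (the example IDs are borrowed from the open-benchmark template updated later in this patch):

```python
from llama_stack.apis.evaluation.evaluation import EvaluationTask

# 1. A curated benchmark: the dataset and graders are already settled.
task = EvaluationTask(benchmark_id="meta-reference-simpleqa")

# 2. A registered dataset plus graders you are still iterating on.
task = EvaluationTask(
    dataset_id="simpleqa",
    grader_ids=["llm-as-judge::405b-simpleqa"],
)

# 3. An ad-hoc data source plus graders, for early experimentation.
#    The concrete shape of `data_source` (rows, uri, ...) is not shown in
#    this patch, so it is left out of the sketch.
```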
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
index 788bdbac5..9c27b4058 100644
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@@ -105,9 +105,7 @@ class CommonRoutingTableImpl(RoutingTable):
         self.dist_registry = dist_registry
 
     async def initialize(self) -> None:
-        async def add_objects(
-            objs: List[RoutableObjectWithProvider], provider_id: str, cls
-        ) -> None:
+        async def add_objects(objs: List[RoutableObjectWithProvider], provider_id: str, cls) -> None:
             for obj in objs:
                 if cls is None:
                     obj.provider_id = provider_id
@@ -142,9 +140,7 @@ class CommonRoutingTableImpl(RoutingTable):
         for p in self.impls_by_provider_id.values():
             await p.shutdown()
 
-    def get_provider_impl(
-        self, routing_key: str, provider_id: Optional[str] = None
-    ) -> Any:
+    def get_provider_impl(self, routing_key: str, provider_id: Optional[str] = None) -> Any:
         def apiname_object():
             if isinstance(self, ModelsRoutingTable):
                 return ("Inference", "model")
@@ -182,9 +178,7 @@ class CommonRoutingTableImpl(RoutingTable):
 
         raise ValueError(f"Provider not found for `{routing_key}`")
 
-    async def get_object_by_identifier(
-        self, type: str, identifier: str
-    ) -> Optional[RoutableObjectWithProvider]:
+    async def get_object_by_identifier(self, type: str, identifier: str) -> Optional[RoutableObjectWithProvider]:
         # Get from disk registry
         obj = await self.dist_registry.get(type, identifier)
         if not obj:
@@ -194,13 +188,9 @@ class CommonRoutingTableImpl(RoutingTable):
 
     async def unregister_object(self, obj: RoutableObjectWithProvider) -> None:
         await self.dist_registry.delete(obj.type, obj.identifier)
-        await unregister_object_from_provider(
-            obj, self.impls_by_provider_id[obj.provider_id]
-        )
+        await unregister_object_from_provider(obj, self.impls_by_provider_id[obj.provider_id])
 
-    async def register_object(
-        self, obj: RoutableObjectWithProvider
-    ) -> RoutableObjectWithProvider:
+    async def register_object(self, obj: RoutableObjectWithProvider) -> RoutableObjectWithProvider:
         # if provider_id is not specified, pick an arbitrary one from existing entries
         if not obj.provider_id and len(self.impls_by_provider_id) > 0:
             obj.provider_id = list(self.impls_by_provider_id.keys())[0]
@@ -258,9 +248,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
         if model_type is None:
             model_type = ModelType.llm
         if "embedding_dimension" not in metadata and model_type == ModelType.embedding:
-            raise ValueError(
-                "Embedding model must have an embedding dimension in its metadata"
-            )
+            raise ValueError("Embedding model must have an embedding dimension in its metadata")
         model = Model(
             identifier=model_id,
             provider_resource_id=provider_model_id,
@@ -280,9 +268,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
 
 class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
     async def list_shields(self) -> ListShieldsResponse:
-        return ListShieldsResponse(
-            data=await self.get_all_with_type(ResourceType.shield.value)
-        )
+        return ListShieldsResponse(data=await self.get_all_with_type(ResourceType.shield.value))
 
     async def get_shield(self, identifier: str) -> Shield:
         shield = await self.get_object_by_identifier("shield", identifier)
@@ -347,18 +333,14 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
                 f"No provider specified and multiple providers available. Arbitrarily selected the first provider {provider_id}."
             )
         else:
-            raise ValueError(
-                "No provider available. Please configure a vector_io provider."
-            )
+            raise ValueError("No provider available. Please configure a vector_io provider.")
         model = await self.get_object_by_identifier("model", embedding_model)
         if model is None:
             raise ValueError(f"Model {embedding_model} not found")
         if model.model_type != ModelType.embedding:
             raise ValueError(f"Model {embedding_model} is not an embedding model")
         if "embedding_dimension" not in model.metadata:
-            raise ValueError(
-                f"Model {embedding_model} does not have an embedding dimension"
-            )
+            raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
         vector_db_data = {
             "identifier": vector_db_id,
             "type": ResourceType.vector_db.value,
@@ -380,9 +362,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
 
 class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
     async def list_datasets(self) -> ListDatasetsResponse:
-        return ListDatasetsResponse(
-            data=await self.get_all_with_type(ResourceType.dataset.value)
-        )
+        return ListDatasetsResponse(data=await self.get_all_with_type(ResourceType.dataset.value))
 
     async def get_dataset(self, dataset_id: str) -> Dataset:
         dataset = await self.get_object_by_identifier("dataset", dataset_id)
@@ -438,14 +418,10 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
 
 class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
     async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
-        return ListScoringFunctionsResponse(
-            data=await self.get_all_with_type(ResourceType.scoring_function.value)
-        )
+        return ListScoringFunctionsResponse(data=await self.get_all_with_type(ResourceType.scoring_function.value))
 
     async def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn:
-        scoring_fn = await self.get_object_by_identifier(
-            "scoring_function", scoring_fn_id
-        )
+        scoring_fn = await self.get_object_by_identifier("scoring_function", scoring_fn_id)
         if scoring_fn is None:
             raise ValueError(f"Scoring function '{scoring_fn_id}' not found")
         return scoring_fn
@@ -554,12 +530,8 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
         args: Optional[Dict[str, Any]] = None,
     ) -> None:
         tools = []
-        tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(
-            toolgroup_id, mcp_endpoint
-        )
-        tool_host = (
-            ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution
-        )
+        tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(toolgroup_id, mcp_endpoint)
+        tool_host = ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution
 
         for tool_def in tool_defs:
             tools.append(
diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py
index b339e8c80..03e524dae 100644
--- a/llama_stack/templates/open-benchmark/open_benchmark.py
+++ b/llama_stack/templates/open-benchmark/open_benchmark.py
@@ -214,27 +214,27 @@ def get_distribution_template() -> DistributionTemplate:
         BenchmarkInput(
             benchmark_id="meta-reference-simpleqa",
             dataset_id="simpleqa",
-            scoring_functions=["llm-as-judge::405b-simpleqa"],
+            grader_ids=["llm-as-judge::405b-simpleqa"],
         ),
         BenchmarkInput(
             benchmark_id="meta-reference-mmlu-cot",
             dataset_id="mmlu_cot",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+            grader_ids=["basic::regex_parser_multiple_choice_answer"],
         ),
         BenchmarkInput(
             benchmark_id="meta-reference-gpqa-cot",
             dataset_id="gpqa_cot",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+            grader_ids=["basic::regex_parser_multiple_choice_answer"],
         ),
         BenchmarkInput(
             benchmark_id="meta-reference-math-500",
             dataset_id="math_500",
-            scoring_functions=["basic::regex_parser_math_response"],
+            grader_ids=["basic::regex_parser_math_response"],
         ),
         BenchmarkInput(
             benchmark_id="meta-reference-bfcl",
             dataset_id="bfcl",
-            scoring_functions=["basic::bfcl"],
+            grader_ids=["basic::bfcl"],
         ),
     ]
     return DistributionTemplate(
diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml
index 93f437273..a3c00af56 100644
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@@ -196,27 +196,27 @@ datasets:
 scoring_fns: []
 benchmarks:
 - dataset_id: simpleqa
-  scoring_functions:
+  grader_ids:
   - llm-as-judge::405b-simpleqa
   metadata: {}
   benchmark_id: meta-reference-simpleqa
 - dataset_id: mmlu_cot
-  scoring_functions:
+  grader_ids:
   - basic::regex_parser_multiple_choice_answer
   metadata: {}
   benchmark_id: meta-reference-mmlu-cot
 - dataset_id: gpqa_cot
-  scoring_functions:
+  grader_ids:
   - basic::regex_parser_multiple_choice_answer
   metadata: {}
   benchmark_id: meta-reference-gpqa-cot
 - dataset_id: math_500
-  scoring_functions:
+  grader_ids:
   - basic::regex_parser_math_response
   metadata: {}
   benchmark_id: meta-reference-math-500
 - dataset_id: bfcl
-  scoring_functions:
+  grader_ids:
   - basic::bfcl
   metadata: {}
   benchmark_id: meta-reference-bfcl
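The template and run.yaml change is mechanical: each benchmark still binds a `dataset_id` to a list of graders, only the field name moves from `scoring_functions` to `grader_ids`. A before/after sketch, assuming `BenchmarkInput` is importable from `llama_stack.distribution.datatypes` (the import path is an assumption, not shown in this patch):

```python
from llama_stack.distribution.datatypes import BenchmarkInput

# Before this patch:
#   BenchmarkInput(
#       benchmark_id="meta-reference-bfcl",
#       dataset_id="bfcl",
#       scoring_functions=["basic::bfcl"],
#   )

# After this patch, the same entry names its graders explicitly:
benchmark = BenchmarkInput(
    benchmark_id="meta-reference-bfcl",
    dataset_id="bfcl",
    grader_ids=["basic::bfcl"],
)
```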