diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 09d4cb805..0f223b51b 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -8548,7 +8548,7 @@
},
"additionalProperties": false,
"title": "EvaluationTask",
- "description": "A task for evaluation. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders. - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders. - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders."
+ "description": "A task for evaluation. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders. - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders. - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders."
},
"GradeRequest": {
"type": "object",
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 72361c50e..7c4ea81b8 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -5927,7 +5927,7 @@ components:
- `benchmark_id`: Run evaluation task against a benchmark_id. Use this when
you have a curated dataset and have settled on the graders. - `dataset_id`
and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids.
- Use this when you have datasets and / or are iterating on your graders. -
+ Use this when you have datasets and / or are iterating on your graders. -
`data_source` and `grader_ids`: Run evaluation task against a data source
(e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are
early in your evaluation cycle and experimenting much more with your data
diff --git a/llama_stack/apis/evaluation/evaluation.py b/llama_stack/apis/evaluation/evaluation.py
index e1f02dbae..269004b26 100644
--- a/llama_stack/apis/evaluation/evaluation.py
+++ b/llama_stack/apis/evaluation/evaluation.py
@@ -52,7 +52,7 @@ class EvaluationTask(BaseModel):
"""
A task for evaluation. To specify a task, one of the following must be provided:
- `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders.
- - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders.
+ - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders.
- `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders.
:param benchmark_id: The benchmark ID to evaluate.
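
For clarity, here is a minimal sketch of the three ways the `EvaluationTask` docstring above says a task may be specified. The import path and the inline `data_source` shape are assumptions for illustration, not taken from this diff; only the field names (`benchmark_id`, `dataset_id`, `grader_ids`, `data_source`) come from the docstring.

```python
# Import path assumed; EvaluationTask is defined in llama_stack/apis/evaluation/evaluation.py.
from llama_stack.apis.evaluation import EvaluationTask

# Option 1: a curated benchmark whose dataset and graders are already settled.
task = EvaluationTask(benchmark_id="meta-reference-mmlu-cot")

# Option 2: your own dataset plus graders you are still iterating on.
task = EvaluationTask(
    dataset_id="mmlu_cot",
    grader_ids=["basic::regex_parser_multiple_choice_answer"],
)

# Option 3: an inline data source plus graders, for early experimentation.
# The exact shape of data_source is assumed here (the docstring only says "rows, uri, etc.").
task = EvaluationTask(
    data_source={"type": "rows", "rows": [{"input": "...", "expected": "..."}]},
    grader_ids=["basic::regex_parser_multiple_choice_answer"],
)
```
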
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
index 788bdbac5..9c27b4058 100644
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@@ -105,9 +105,7 @@ class CommonRoutingTableImpl(RoutingTable):
self.dist_registry = dist_registry
async def initialize(self) -> None:
- async def add_objects(
- objs: List[RoutableObjectWithProvider], provider_id: str, cls
- ) -> None:
+ async def add_objects(objs: List[RoutableObjectWithProvider], provider_id: str, cls) -> None:
for obj in objs:
if cls is None:
obj.provider_id = provider_id
@@ -142,9 +140,7 @@ class CommonRoutingTableImpl(RoutingTable):
for p in self.impls_by_provider_id.values():
await p.shutdown()
- def get_provider_impl(
- self, routing_key: str, provider_id: Optional[str] = None
- ) -> Any:
+ def get_provider_impl(self, routing_key: str, provider_id: Optional[str] = None) -> Any:
def apiname_object():
if isinstance(self, ModelsRoutingTable):
return ("Inference", "model")
@@ -182,9 +178,7 @@ class CommonRoutingTableImpl(RoutingTable):
raise ValueError(f"Provider not found for `{routing_key}`")
- async def get_object_by_identifier(
- self, type: str, identifier: str
- ) -> Optional[RoutableObjectWithProvider]:
+ async def get_object_by_identifier(self, type: str, identifier: str) -> Optional[RoutableObjectWithProvider]:
# Get from disk registry
obj = await self.dist_registry.get(type, identifier)
if not obj:
@@ -194,13 +188,9 @@ class CommonRoutingTableImpl(RoutingTable):
async def unregister_object(self, obj: RoutableObjectWithProvider) -> None:
await self.dist_registry.delete(obj.type, obj.identifier)
- await unregister_object_from_provider(
- obj, self.impls_by_provider_id[obj.provider_id]
- )
+ await unregister_object_from_provider(obj, self.impls_by_provider_id[obj.provider_id])
- async def register_object(
- self, obj: RoutableObjectWithProvider
- ) -> RoutableObjectWithProvider:
+ async def register_object(self, obj: RoutableObjectWithProvider) -> RoutableObjectWithProvider:
# if provider_id is not specified, pick an arbitrary one from existing entries
if not obj.provider_id and len(self.impls_by_provider_id) > 0:
obj.provider_id = list(self.impls_by_provider_id.keys())[0]
@@ -258,9 +248,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
if model_type is None:
model_type = ModelType.llm
if "embedding_dimension" not in metadata and model_type == ModelType.embedding:
- raise ValueError(
- "Embedding model must have an embedding dimension in its metadata"
- )
+ raise ValueError("Embedding model must have an embedding dimension in its metadata")
model = Model(
identifier=model_id,
provider_resource_id=provider_model_id,
@@ -280,9 +268,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
async def list_shields(self) -> ListShieldsResponse:
- return ListShieldsResponse(
- data=await self.get_all_with_type(ResourceType.shield.value)
- )
+ return ListShieldsResponse(data=await self.get_all_with_type(ResourceType.shield.value))
async def get_shield(self, identifier: str) -> Shield:
shield = await self.get_object_by_identifier("shield", identifier)
@@ -347,18 +333,14 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
f"No provider specified and multiple providers available. Arbitrarily selected the first provider {provider_id}."
)
else:
- raise ValueError(
- "No provider available. Please configure a vector_io provider."
- )
+ raise ValueError("No provider available. Please configure a vector_io provider.")
model = await self.get_object_by_identifier("model", embedding_model)
if model is None:
raise ValueError(f"Model {embedding_model} not found")
if model.model_type != ModelType.embedding:
raise ValueError(f"Model {embedding_model} is not an embedding model")
if "embedding_dimension" not in model.metadata:
- raise ValueError(
- f"Model {embedding_model} does not have an embedding dimension"
- )
+ raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
vector_db_data = {
"identifier": vector_db_id,
"type": ResourceType.vector_db.value,
@@ -380,9 +362,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
async def list_datasets(self) -> ListDatasetsResponse:
- return ListDatasetsResponse(
- data=await self.get_all_with_type(ResourceType.dataset.value)
- )
+ return ListDatasetsResponse(data=await self.get_all_with_type(ResourceType.dataset.value))
async def get_dataset(self, dataset_id: str) -> Dataset:
dataset = await self.get_object_by_identifier("dataset", dataset_id)
@@ -438,14 +418,10 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
- return ListScoringFunctionsResponse(
- data=await self.get_all_with_type(ResourceType.scoring_function.value)
- )
+ return ListScoringFunctionsResponse(data=await self.get_all_with_type(ResourceType.scoring_function.value))
async def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn:
- scoring_fn = await self.get_object_by_identifier(
- "scoring_function", scoring_fn_id
- )
+ scoring_fn = await self.get_object_by_identifier("scoring_function", scoring_fn_id)
if scoring_fn is None:
raise ValueError(f"Scoring function '{scoring_fn_id}' not found")
return scoring_fn
@@ -554,12 +530,8 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
args: Optional[Dict[str, Any]] = None,
) -> None:
tools = []
- tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(
- toolgroup_id, mcp_endpoint
- )
- tool_host = (
- ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution
- )
+ tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(toolgroup_id, mcp_endpoint)
+ tool_host = ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution
for tool_def in tool_defs:
tools.append(
diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py
index b339e8c80..03e524dae 100644
--- a/llama_stack/templates/open-benchmark/open_benchmark.py
+++ b/llama_stack/templates/open-benchmark/open_benchmark.py
@@ -214,27 +214,27 @@ def get_distribution_template() -> DistributionTemplate:
BenchmarkInput(
benchmark_id="meta-reference-simpleqa",
dataset_id="simpleqa",
- scoring_functions=["llm-as-judge::405b-simpleqa"],
+ grader_ids=["llm-as-judge::405b-simpleqa"],
),
BenchmarkInput(
benchmark_id="meta-reference-mmlu-cot",
dataset_id="mmlu_cot",
- scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+ grader_ids=["basic::regex_parser_multiple_choice_answer"],
),
BenchmarkInput(
benchmark_id="meta-reference-gpqa-cot",
dataset_id="gpqa_cot",
- scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+ grader_ids=["basic::regex_parser_multiple_choice_answer"],
),
BenchmarkInput(
benchmark_id="meta-reference-math-500",
dataset_id="math_500",
- scoring_functions=["basic::regex_parser_math_response"],
+ grader_ids=["basic::regex_parser_math_response"],
),
BenchmarkInput(
benchmark_id="meta-reference-bfcl",
dataset_id="bfcl",
- scoring_functions=["basic::bfcl"],
+ grader_ids=["basic::bfcl"],
),
]
return DistributionTemplate(
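
As a usage note for the rename above, a benchmark entry in the template is now declared with `grader_ids` instead of `scoring_functions`. A minimal sketch, with the values copied from the hunk and the import path assumed rather than confirmed by this diff:

```python
# Import path assumed; BenchmarkInput is the template input type used in open_benchmark.py.
from llama_stack.distribution.datatypes import BenchmarkInput

benchmark = BenchmarkInput(
    benchmark_id="meta-reference-mmlu-cot",
    dataset_id="mmlu_cot",
    grader_ids=["basic::regex_parser_multiple_choice_answer"],  # previously scoring_functions
)
```

The same rename appears in the generated `run.yaml` below, where each `benchmarks:` entry lists its graders under `grader_ids`.
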
diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml
index 93f437273..a3c00af56 100644
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@@ -196,27 +196,27 @@ datasets:
scoring_fns: []
benchmarks:
- dataset_id: simpleqa
- scoring_functions:
+ grader_ids:
- llm-as-judge::405b-simpleqa
metadata: {}
benchmark_id: meta-reference-simpleqa
- dataset_id: mmlu_cot
- scoring_functions:
+ grader_ids:
- basic::regex_parser_multiple_choice_answer
metadata: {}
benchmark_id: meta-reference-mmlu-cot
- dataset_id: gpqa_cot
- scoring_functions:
+ grader_ids:
- basic::regex_parser_multiple_choice_answer
metadata: {}
benchmark_id: meta-reference-gpqa-cot
- dataset_id: math_500
- scoring_functions:
+ grader_ids:
- basic::regex_parser_math_response
metadata: {}
benchmark_id: meta-reference-math-500
- dataset_id: bfcl
- scoring_functions:
+ grader_ids:
- basic::bfcl
metadata: {}
benchmark_id: meta-reference-bfcl