Xi Yan 2025-03-18 20:39:38 -07:00
parent cb343aa25c
commit 9a660a934b
6 changed files with 27 additions and 55 deletions

View file

@@ -8548,7 +8548,7 @@
       },
       "additionalProperties": false,
       "title": "EvaluationTask",
-      "description": "A task for evaluation. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders. - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders. - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders."
+      "description": "A task for evaluation. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders. - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders. - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders."
     },
     "GradeRequest": {
       "type": "object",

View file

@@ -5927,7 +5927,7 @@ components:
           - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when
           you have a curated dataset and have settled on the graders. - `dataset_id`
           and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids.
-          Use this when you have datasets and / or are iterating on your graders. -
+          Use this when you have datasets and / or are iterating on your graders. -
           `data_source` and `grader_ids`: Run evaluation task against a data source
           (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are
           early in your evaluation cycle and experimenting much more with your data

View file

@@ -52,7 +52,7 @@ class EvaluationTask(BaseModel):
     """
     A task for evaluation. To specify a task, one of the following must be provided:
     - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders.
-    - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders.
+    - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders.
     - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders.

     :param benchmark_id: The benchmark ID to evaluate.
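For orientation, here is a minimal sketch of the three mutually exclusive ways to specify a task per the docstring above. The import path and the exact shape of `data_source` are assumptions, not taken from this diff:

# Sketch only: import path and data_source payload shape are assumed.
from llama_stack.apis.evaluation import EvaluationTask

# 1. Curated dataset, settled graders: reference a registered benchmark.
task = EvaluationTask(benchmark_id="meta-reference-simpleqa")

# 2. Iterating on graders against a registered dataset.
task = EvaluationTask(dataset_id="simpleqa", grader_ids=["llm-as-judge::405b-simpleqa"])

# 3. Early experimentation: point at an inline data source (rows-style payload assumed).
task = EvaluationTask(
    data_source={"type": "rows", "rows": [{"question": "...", "expected_answer": "..."}]},
    grader_ids=["llm-as-judge::405b-simpleqa"],
)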

View file

@@ -105,9 +105,7 @@ class CommonRoutingTableImpl(RoutingTable):
         self.dist_registry = dist_registry

     async def initialize(self) -> None:
-        async def add_objects(
-            objs: List[RoutableObjectWithProvider], provider_id: str, cls
-        ) -> None:
+        async def add_objects(objs: List[RoutableObjectWithProvider], provider_id: str, cls) -> None:
             for obj in objs:
                 if cls is None:
                     obj.provider_id = provider_id
@@ -142,9 +140,7 @@ class CommonRoutingTableImpl(RoutingTable):
         for p in self.impls_by_provider_id.values():
             await p.shutdown()

-    def get_provider_impl(
-        self, routing_key: str, provider_id: Optional[str] = None
-    ) -> Any:
+    def get_provider_impl(self, routing_key: str, provider_id: Optional[str] = None) -> Any:
         def apiname_object():
             if isinstance(self, ModelsRoutingTable):
                 return ("Inference", "model")
@@ -182,9 +178,7 @@ class CommonRoutingTableImpl(RoutingTable):
         raise ValueError(f"Provider not found for `{routing_key}`")

-    async def get_object_by_identifier(
-        self, type: str, identifier: str
-    ) -> Optional[RoutableObjectWithProvider]:
+    async def get_object_by_identifier(self, type: str, identifier: str) -> Optional[RoutableObjectWithProvider]:
         # Get from disk registry
         obj = await self.dist_registry.get(type, identifier)
         if not obj:
@@ -194,13 +188,9 @@ class CommonRoutingTableImpl(RoutingTable):
     async def unregister_object(self, obj: RoutableObjectWithProvider) -> None:
         await self.dist_registry.delete(obj.type, obj.identifier)
-        await unregister_object_from_provider(
-            obj, self.impls_by_provider_id[obj.provider_id]
-        )
+        await unregister_object_from_provider(obj, self.impls_by_provider_id[obj.provider_id])

-    async def register_object(
-        self, obj: RoutableObjectWithProvider
-    ) -> RoutableObjectWithProvider:
+    async def register_object(self, obj: RoutableObjectWithProvider) -> RoutableObjectWithProvider:
         # if provider_id is not specified, pick an arbitrary one from existing entries
         if not obj.provider_id and len(self.impls_by_provider_id) > 0:
             obj.provider_id = list(self.impls_by_provider_id.keys())[0]
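The provider fallback in register_object, isolated as a runnable sketch. Since dicts preserve insertion order in Python 3.7+, "arbitrary" here means the first provider configured:

def pick_provider_id(obj_provider_id, impls_by_provider_id):
    # Mirrors the fallback above: keep an explicit provider_id; otherwise
    # arbitrarily take the first configured provider.
    if not obj_provider_id and len(impls_by_provider_id) > 0:
        return list(impls_by_provider_id.keys())[0]
    return obj_provider_id

assert pick_provider_id(None, {"ollama": ..., "vllm": ...}) == "ollama"
assert pick_provider_id("vllm", {"ollama": ..., "vllm": ...}) == "vllm"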
@@ -258,9 +248,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
         if model_type is None:
             model_type = ModelType.llm
         if "embedding_dimension" not in metadata and model_type == ModelType.embedding:
-            raise ValueError(
-                "Embedding model must have an embedding dimension in its metadata"
-            )
+            raise ValueError("Embedding model must have an embedding dimension in its metadata")
         model = Model(
             identifier=model_id,
             provider_resource_id=provider_model_id,
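The guard that was collapsed above, as a runnable stand-alone check (ModelType stubbed with the member names visible in the diff):

from enum import Enum

class ModelType(str, Enum):  # stub matching the identifiers used above
    llm = "llm"
    embedding = "embedding"

def validate_model_metadata(metadata: dict, model_type: ModelType) -> None:
    # Embedding models must declare their vector size; LLMs need no such field.
    if "embedding_dimension" not in metadata and model_type == ModelType.embedding:
        raise ValueError("Embedding model must have an embedding dimension in its metadata")

validate_model_metadata({"embedding_dimension": 384}, ModelType.embedding)  # passes
validate_model_metadata({}, ModelType.llm)                                  # passes
# validate_model_metadata({}, ModelType.embedding) raises ValueError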
@@ -280,9 +268,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
 class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
     async def list_shields(self) -> ListShieldsResponse:
-        return ListShieldsResponse(
-            data=await self.get_all_with_type(ResourceType.shield.value)
-        )
+        return ListShieldsResponse(data=await self.get_all_with_type(ResourceType.shield.value))

     async def get_shield(self, identifier: str) -> Shield:
         shield = await self.get_object_by_identifier("shield", identifier)
@@ -347,18 +333,14 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
                 f"No provider specified and multiple providers available. Arbitrarily selected the first provider {provider_id}."
             )
         else:
-            raise ValueError(
-                "No provider available. Please configure a vector_io provider."
-            )
+            raise ValueError("No provider available. Please configure a vector_io provider.")
         model = await self.get_object_by_identifier("model", embedding_model)
         if model is None:
             raise ValueError(f"Model {embedding_model} not found")
         if model.model_type != ModelType.embedding:
             raise ValueError(f"Model {embedding_model} is not an embedding model")
         if "embedding_dimension" not in model.metadata:
-            raise ValueError(
-                f"Model {embedding_model} does not have an embedding dimension"
-            )
+            raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
         vector_db_data = {
             "identifier": vector_db_id,
             "type": ResourceType.vector_db.value,
@@ -380,9 +362,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
 class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
     async def list_datasets(self) -> ListDatasetsResponse:
-        return ListDatasetsResponse(
-            data=await self.get_all_with_type(ResourceType.dataset.value)
-        )
+        return ListDatasetsResponse(data=await self.get_all_with_type(ResourceType.dataset.value))

     async def get_dataset(self, dataset_id: str) -> Dataset:
         dataset = await self.get_object_by_identifier("dataset", dataset_id)
@@ -438,14 +418,10 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
 class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
     async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
-        return ListScoringFunctionsResponse(
-            data=await self.get_all_with_type(ResourceType.scoring_function.value)
-        )
+        return ListScoringFunctionsResponse(data=await self.get_all_with_type(ResourceType.scoring_function.value))

     async def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn:
-        scoring_fn = await self.get_object_by_identifier(
-            "scoring_function", scoring_fn_id
-        )
+        scoring_fn = await self.get_object_by_identifier("scoring_function", scoring_fn_id)
         if scoring_fn is None:
             raise ValueError(f"Scoring function '{scoring_fn_id}' not found")
         return scoring_fn
@@ -554,12 +530,8 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
         args: Optional[Dict[str, Any]] = None,
     ) -> None:
         tools = []
-        tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(
-            toolgroup_id, mcp_endpoint
-        )
-        tool_host = (
-            ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution
-        )
+        tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(toolgroup_id, mcp_endpoint)
+        tool_host = ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution
         for tool_def in tool_defs:
             tools.append(
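The tool_host one-liner above, in isolation (ToolHost stubbed with the two member names visible in the diff; the endpoint URL is illustrative):

from enum import Enum
from typing import Optional

class ToolHost(Enum):  # stub with the two members referenced above
    distribution = "distribution"
    model_context_protocol = "model_context_protocol"

def pick_tool_host(mcp_endpoint: Optional[str]) -> ToolHost:
    # MCP-backed tool groups are hosted by the model context protocol,
    # everything else by the distribution itself.
    return ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution

assert pick_tool_host(None) is ToolHost.distribution
assert pick_tool_host("http://localhost:3000/sse") is ToolHost.model_context_protocol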

View file

@@ -214,27 +214,27 @@ def get_distribution_template() -> DistributionTemplate:
         BenchmarkInput(
             benchmark_id="meta-reference-simpleqa",
             dataset_id="simpleqa",
-            scoring_functions=["llm-as-judge::405b-simpleqa"],
+            grader_ids=["llm-as-judge::405b-simpleqa"],
         ),
         BenchmarkInput(
             benchmark_id="meta-reference-mmlu-cot",
             dataset_id="mmlu_cot",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+            grader_ids=["basic::regex_parser_multiple_choice_answer"],
         ),
         BenchmarkInput(
             benchmark_id="meta-reference-gpqa-cot",
             dataset_id="gpqa_cot",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+            grader_ids=["basic::regex_parser_multiple_choice_answer"],
         ),
         BenchmarkInput(
             benchmark_id="meta-reference-math-500",
             dataset_id="math_500",
-            scoring_functions=["basic::regex_parser_math_response"],
+            grader_ids=["basic::regex_parser_math_response"],
         ),
         BenchmarkInput(
             benchmark_id="meta-reference-bfcl",
             dataset_id="bfcl",
-            scoring_functions=["basic::bfcl"],
+            grader_ids=["basic::bfcl"],
         ),
     ]
     return DistributionTemplate(

View file

@@ -196,27 +196,27 @@ datasets:
 scoring_fns: []
 benchmarks:
 - dataset_id: simpleqa
-  scoring_functions:
+  grader_ids:
   - llm-as-judge::405b-simpleqa
   metadata: {}
   benchmark_id: meta-reference-simpleqa
 - dataset_id: mmlu_cot
-  scoring_functions:
+  grader_ids:
   - basic::regex_parser_multiple_choice_answer
   metadata: {}
   benchmark_id: meta-reference-mmlu-cot
 - dataset_id: gpqa_cot
-  scoring_functions:
+  grader_ids:
   - basic::regex_parser_multiple_choice_answer
   metadata: {}
   benchmark_id: meta-reference-gpqa-cot
 - dataset_id: math_500
-  scoring_functions:
+  grader_ids:
   - basic::regex_parser_math_response
   metadata: {}
   benchmark_id: meta-reference-math-500
 - dataset_id: bfcl
-  scoring_functions:
+  grader_ids:
   - basic::bfcl
   metadata: {}
   benchmark_id: meta-reference-bfcl
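For run configs generated before this change, a hypothetical one-off migration that renames the benchmark key in place (PyYAML assumed available; the file name is illustrative):

import yaml  # PyYAML, assumed available

with open("run.yaml") as f:
    config = yaml.safe_load(f)

# Rename the old key to the new one on every benchmark entry.
for bench in config.get("benchmarks", []):
    if "scoring_functions" in bench:
        bench["grader_ids"] = bench.pop("scoring_functions")

with open("run.yaml", "w") as f:
    yaml.safe_dump(config, f, sort_keys=False)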