Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-07 11:08:20 +00:00)
commit 9a660a934b
parent cb343aa25c

    fix

6 changed files with 27 additions and 55 deletions
docs/_static/llama-stack-spec.html (vendored, 2 changes)
@@ -8548,7 +8548,7 @@
         },
         "additionalProperties": false,
         "title": "EvaluationTask",
         "description": "A task for evaluation. To specify a task, one of the following must be provided: - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders. - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders. - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders."
       },
       "GradeRequest": {
         "type": "object",
docs/_static/llama-stack-spec.yaml (vendored, 2 changes)
@@ -5927,7 +5927,7 @@ components:
       - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when
         you have a curated dataset and have settled on the graders. - `dataset_id`
         and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids.
         Use this when you have datasets and / or are iterating on your graders. -
         `data_source` and `grader_ids`: Run evaluation task against a data source
         (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are
         early in your evaluation cycle and experimenting much more with your data
@@ -52,7 +52,7 @@ class EvaluationTask(BaseModel):
     """
     A task for evaluation. To specify a task, one of the following must be provided:
     - `benchmark_id`: Run evaluation task against a benchmark_id. Use this when you have a curated dataset and have settled on the graders.
    - `dataset_id` and `grader_ids`: Run evaluation task against a dataset_id and a list of grader_ids. Use this when you have datasets and / or are iterating on your graders.
    - `data_source` and `grader_ids`: Run evaluation task against a data source (e.g. rows, uri, etc.) and a list of grader_ids. Prefer this when you are early in your evaluation cycle and experimenting much more with your data and graders.

     :param benchmark_id: The benchmark ID to evaluate.
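For reference, a minimal sketch of the three mutually exclusive ways to construct an EvaluationTask described in the docstring above. The field names, grader IDs, and benchmark/dataset IDs are taken from this commit; the import path and the shape of the inline data source are assumptions for illustration only.

    # Hedged sketch; the import path below is an assumption, not confirmed by this diff.
    from llama_stack.apis.evaluation import EvaluationTask

    # 1) Curated dataset with settled graders: reference a registered benchmark.
    task = EvaluationTask(benchmark_id="meta-reference-mmlu-cot")

    # 2) Iterating on graders against a registered dataset.
    task = EvaluationTask(
        dataset_id="mmlu_cot",
        grader_ids=["basic::regex_parser_multiple_choice_answer"],
    )

    # 3) Early experimentation: inline data source plus graders.
    #    The rows-style payload here is illustrative, not taken from this diff.
    task = EvaluationTask(
        data_source={"rows": [{"question": "2 + 2?", "expected_answer": "4"}]},
        grader_ids=["llm-as-judge::405b-simpleqa"],
    )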
@@ -105,9 +105,7 @@ class CommonRoutingTableImpl(RoutingTable):
         self.dist_registry = dist_registry

     async def initialize(self) -> None:
-        async def add_objects(
-            objs: List[RoutableObjectWithProvider], provider_id: str, cls
-        ) -> None:
+        async def add_objects(objs: List[RoutableObjectWithProvider], provider_id: str, cls) -> None:
             for obj in objs:
                 if cls is None:
                     obj.provider_id = provider_id
@@ -142,9 +140,7 @@ class CommonRoutingTableImpl(RoutingTable):
         for p in self.impls_by_provider_id.values():
             await p.shutdown()

-    def get_provider_impl(
-        self, routing_key: str, provider_id: Optional[str] = None
-    ) -> Any:
+    def get_provider_impl(self, routing_key: str, provider_id: Optional[str] = None) -> Any:
         def apiname_object():
             if isinstance(self, ModelsRoutingTable):
                 return ("Inference", "model")
@@ -182,9 +178,7 @@ class CommonRoutingTableImpl(RoutingTable):

         raise ValueError(f"Provider not found for `{routing_key}`")

-    async def get_object_by_identifier(
-        self, type: str, identifier: str
-    ) -> Optional[RoutableObjectWithProvider]:
+    async def get_object_by_identifier(self, type: str, identifier: str) -> Optional[RoutableObjectWithProvider]:
         # Get from disk registry
         obj = await self.dist_registry.get(type, identifier)
         if not obj:
@@ -194,13 +188,9 @@ class CommonRoutingTableImpl(RoutingTable):

     async def unregister_object(self, obj: RoutableObjectWithProvider) -> None:
         await self.dist_registry.delete(obj.type, obj.identifier)
-        await unregister_object_from_provider(
-            obj, self.impls_by_provider_id[obj.provider_id]
-        )
+        await unregister_object_from_provider(obj, self.impls_by_provider_id[obj.provider_id])

-    async def register_object(
-        self, obj: RoutableObjectWithProvider
-    ) -> RoutableObjectWithProvider:
+    async def register_object(self, obj: RoutableObjectWithProvider) -> RoutableObjectWithProvider:
         # if provider_id is not specified, pick an arbitrary one from existing entries
         if not obj.provider_id and len(self.impls_by_provider_id) > 0:
             obj.provider_id = list(self.impls_by_provider_id.keys())[0]
@@ -258,9 +248,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
         if model_type is None:
             model_type = ModelType.llm
         if "embedding_dimension" not in metadata and model_type == ModelType.embedding:
-            raise ValueError(
-                "Embedding model must have an embedding dimension in its metadata"
-            )
+            raise ValueError("Embedding model must have an embedding dimension in its metadata")
         model = Model(
             identifier=model_id,
             provider_resource_id=provider_model_id,
@@ -280,9 +268,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):

 class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
     async def list_shields(self) -> ListShieldsResponse:
-        return ListShieldsResponse(
-            data=await self.get_all_with_type(ResourceType.shield.value)
-        )
+        return ListShieldsResponse(data=await self.get_all_with_type(ResourceType.shield.value))

     async def get_shield(self, identifier: str) -> Shield:
         shield = await self.get_object_by_identifier("shield", identifier)
@@ -347,18 +333,14 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
                 f"No provider specified and multiple providers available. Arbitrarily selected the first provider {provider_id}."
             )
         else:
-            raise ValueError(
-                "No provider available. Please configure a vector_io provider."
-            )
+            raise ValueError("No provider available. Please configure a vector_io provider.")
         model = await self.get_object_by_identifier("model", embedding_model)
         if model is None:
             raise ValueError(f"Model {embedding_model} not found")
         if model.model_type != ModelType.embedding:
             raise ValueError(f"Model {embedding_model} is not an embedding model")
         if "embedding_dimension" not in model.metadata:
-            raise ValueError(
-                f"Model {embedding_model} does not have an embedding dimension"
-            )
+            raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
         vector_db_data = {
             "identifier": vector_db_id,
             "type": ResourceType.vector_db.value,
@@ -380,9 +362,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):

 class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
     async def list_datasets(self) -> ListDatasetsResponse:
-        return ListDatasetsResponse(
-            data=await self.get_all_with_type(ResourceType.dataset.value)
-        )
+        return ListDatasetsResponse(data=await self.get_all_with_type(ResourceType.dataset.value))

     async def get_dataset(self, dataset_id: str) -> Dataset:
         dataset = await self.get_object_by_identifier("dataset", dataset_id)
@@ -438,14 +418,10 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):

 class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
     async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
-        return ListScoringFunctionsResponse(
-            data=await self.get_all_with_type(ResourceType.scoring_function.value)
-        )
+        return ListScoringFunctionsResponse(data=await self.get_all_with_type(ResourceType.scoring_function.value))

     async def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn:
-        scoring_fn = await self.get_object_by_identifier(
-            "scoring_function", scoring_fn_id
-        )
+        scoring_fn = await self.get_object_by_identifier("scoring_function", scoring_fn_id)
         if scoring_fn is None:
             raise ValueError(f"Scoring function '{scoring_fn_id}' not found")
         return scoring_fn
@@ -554,12 +530,8 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
         args: Optional[Dict[str, Any]] = None,
     ) -> None:
         tools = []
-        tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(
-            toolgroup_id, mcp_endpoint
-        )
-        tool_host = (
-            ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution
-        )
+        tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(toolgroup_id, mcp_endpoint)
+        tool_host = ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution

         for tool_def in tool_defs:
             tools.append(
@@ -214,27 +214,27 @@ def get_distribution_template() -> DistributionTemplate:
             BenchmarkInput(
                 benchmark_id="meta-reference-simpleqa",
                 dataset_id="simpleqa",
-                scoring_functions=["llm-as-judge::405b-simpleqa"],
+                grader_ids=["llm-as-judge::405b-simpleqa"],
             ),
             BenchmarkInput(
                 benchmark_id="meta-reference-mmlu-cot",
                 dataset_id="mmlu_cot",
-                scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+                grader_ids=["basic::regex_parser_multiple_choice_answer"],
             ),
             BenchmarkInput(
                 benchmark_id="meta-reference-gpqa-cot",
                 dataset_id="gpqa_cot",
-                scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+                grader_ids=["basic::regex_parser_multiple_choice_answer"],
             ),
             BenchmarkInput(
                 benchmark_id="meta-reference-math-500",
                 dataset_id="math_500",
-                scoring_functions=["basic::regex_parser_math_response"],
+                grader_ids=["basic::regex_parser_math_response"],
             ),
             BenchmarkInput(
                 benchmark_id="meta-reference-bfcl",
                 dataset_id="bfcl",
-                scoring_functions=["basic::bfcl"],
+                grader_ids=["basic::bfcl"],
             ),
         ]
     return DistributionTemplate(
@@ -196,27 +196,27 @@ datasets:
 scoring_fns: []
 benchmarks:
 - dataset_id: simpleqa
-  scoring_functions:
+  grader_ids:
   - llm-as-judge::405b-simpleqa
   metadata: {}
   benchmark_id: meta-reference-simpleqa
 - dataset_id: mmlu_cot
-  scoring_functions:
+  grader_ids:
   - basic::regex_parser_multiple_choice_answer
   metadata: {}
   benchmark_id: meta-reference-mmlu-cot
 - dataset_id: gpqa_cot
-  scoring_functions:
+  grader_ids:
   - basic::regex_parser_multiple_choice_answer
   metadata: {}
   benchmark_id: meta-reference-gpqa-cot
 - dataset_id: math_500
-  scoring_functions:
+  grader_ids:
   - basic::regex_parser_math_response
   metadata: {}
   benchmark_id: meta-reference-math-500
 - dataset_id: bfcl
-  scoring_functions:
+  grader_ids:
   - basic::bfcl
   metadata: {}
   benchmark_id: meta-reference-bfcl