mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-06 18:50:44 +00:00
skeleton evaluation implementation
This commit is contained in:
parent
443b18a992
commit
9650549a54
6 changed files with 168 additions and 0 deletions
|
@ -22,6 +22,7 @@ class Api(Enum):
|
||||||
datasetio = "datasetio"
|
datasetio = "datasetio"
|
||||||
post_training = "post_training"
|
post_training = "post_training"
|
||||||
tool_runtime = "tool_runtime"
|
tool_runtime = "tool_runtime"
|
||||||
|
evaluation = "evaluation"
|
||||||
|
|
||||||
telemetry = "telemetry"
|
telemetry = "telemetry"
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
|
# the root directory of this source tree.
|
||||||
|
from typing import Any, Dict
|
||||||
|
|
||||||
|
from llama_stack.distribution.datatypes import Api
|
||||||
|
|
||||||
|
from .config import MetaReferenceEvaluationConfig
|
||||||
|
|
||||||
|
|
||||||
|
async def get_provider_impl(
    config: MetaReferenceEvaluationConfig,
    deps: Dict[Api, Any],
):
    """Construct and initialize the meta-reference Evaluation provider.

    The implementation module is imported lazily so that merely importing
    this package does not pull in the (heavier) evaluation machinery.

    :param config: provider configuration (kvstore backing, etc.)
    :param deps: resolved API implementations, keyed by ``Api`` enum value;
        the datasetio, datasets, inference and agents APIs must be present.
    :returns: an initialized ``MetaReferenceEvaluationImpl`` instance.
    """
    from .evaluation import MetaReferenceEvaluationImpl

    provider = MetaReferenceEvaluationImpl(
        config,
        deps[Api.datasetio],
        deps[Api.datasets],
        deps[Api.inference],
        deps[Api.agents],
    )
    await provider.initialize()
    return provider
|
|
@ -0,0 +1,26 @@
|
||||||
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
|
# the root directory of this source tree.
|
||||||
|
from typing import Any, Dict
|
||||||
|
|
||||||
|
from llama_stack.providers.utils.kvstore.config import (
|
||||||
|
KVStoreConfig,
|
||||||
|
SqliteKVStoreConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class MetaReferenceEvaluationConfig(BaseModel):
    """Configuration for the meta-reference Evaluation provider."""

    # Key-value store used to persist evaluation provider state.
    kvstore: KVStoreConfig

    @classmethod
    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
        """Return a sample run-config dict rooted at *__distro_dir__*.

        Extra keyword arguments are accepted (and ignored) so callers can
        pass a uniform set of options to every provider's sample config.
        """
        sample_kvstore = SqliteKVStoreConfig.sample_run_config(
            __distro_dir__=__distro_dir__,
            db_name="meta_reference_evaluation.db",
        )
        return {"kvstore": sample_kvstore}
|
|
@ -0,0 +1,77 @@
|
||||||
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
|
# the root directory of this source tree.
|
||||||
|
import json
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from llama_stack.apis.agents import Agents
|
||||||
|
from llama_stack.apis.datasetio import DatasetIO
|
||||||
|
from llama_stack.apis.datasets import Datasets
|
||||||
|
from llama_stack.apis.inference import Inference
|
||||||
|
from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from .....apis.benchmarks import Benchmark
|
||||||
|
|
||||||
|
from .....apis.common.job_types import Job
|
||||||
|
from .....apis.evaluation.evaluation import (
|
||||||
|
Evaluation,
|
||||||
|
EvaluationCandidate,
|
||||||
|
EvaluationJob,
|
||||||
|
EvaluationResponse,
|
||||||
|
EvaluationTask,
|
||||||
|
)
|
||||||
|
from .config import MetaReferenceEvaluationConfig
|
||||||
|
|
||||||
|
EVAL_TASKS_PREFIX = "benchmarks:"
|
||||||
|
|
||||||
|
|
||||||
|
class MetaReferenceEvaluationImpl(
    Evaluation,
    BenchmarksProtocolPrivate,
):
    """Skeleton meta-reference implementation of the Evaluation API.

    Construction wires up the dependent API handles; the lifecycle hooks and
    benchmark registration are currently no-ops, and every evaluation entry
    point (run / run_sync / grade / grade_sync) raises ``NotImplementedError``.
    """

    def __init__(
        self,
        config: MetaReferenceEvaluationConfig,
        datasetio_api: DatasetIO,
        datasets_api: Datasets,
        inference_api: Inference,
        agents_api: Agents,
    ) -> None:
        # Stash the config and dependent API implementations; nothing is
        # used yet beyond storage, since all entry points are stubs.
        self.config = config
        self.datasetio_api = datasetio_api
        self.datasets_api = datasets_api
        self.inference_api = inference_api
        self.agents_api = agents_api

    async def initialize(self) -> None:
        # No setup required for the skeleton implementation.
        pass

    async def shutdown(self) -> None:
        # No resources to release yet.
        pass

    async def register_benchmark(self, benchmark: Benchmark) -> None:
        # Benchmarks are accepted but not persisted yet.
        # NOTE(review): presumably they will be stored under
        # EVAL_TASKS_PREFIX in the kvstore once implemented — confirm.
        pass

    async def run(
        self,
        task: EvaluationTask,
        candidate: EvaluationCandidate,
    ) -> EvaluationJob:
        # Asynchronous evaluation entry point — not implemented yet.
        raise NotImplementedError("Run is not implemented yet")

    async def run_sync(
        self,
        task: EvaluationTask,
        candidate: EvaluationCandidate,
    ) -> EvaluationResponse:
        # Synchronous (blocking) evaluation entry point — not implemented yet.
        raise NotImplementedError("Run sync is not implemented yet")

    async def grade(self, task: EvaluationTask) -> EvaluationJob:
        # Asynchronous grading entry point — not implemented yet.
        raise NotImplementedError("Grade is not implemented yet")

    async def grade_sync(self, task: EvaluationTask) -> EvaluationResponse:
        # Synchronous grading entry point — not implemented yet.
        raise NotImplementedError("Grade sync is not implemented yet")
|
36
llama_stack/providers/registry/evaluation.py
Normal file
36
llama_stack/providers/registry/evaluation.py
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
|
# the root directory of this source tree.
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
|
||||||
|
from llama_stack.providers.utils.kvstore import kvstore_dependencies
|
||||||
|
|
||||||
|
|
||||||
|
def available_providers() -> List[ProviderSpec]:
    """Return the provider specs available for the Evaluation API.

    Currently a single inline (meta-reference) provider.
    """
    return [
        InlineProviderSpec(
            api=Api.evaluation,
            provider_type="inline::meta-reference",
            pip_packages=[
                "matplotlib",
                "pillow",
                "pandas",
                "scikit-learn",
            ]
            + kvstore_dependencies(),
            module="llama_stack.providers.inline.evaluation.meta_reference",
            # Fixed: must name the class the provider package actually
            # exposes — config.py defines MetaReferenceEvaluationConfig
            # (there is no MetaReferenceEvaluationImplConfig), and
            # get_provider_impl() is typed against it.
            config_class="llama_stack.providers.inline.evaluation.meta_reference.MetaReferenceEvaluationConfig",
            # Fixed: get_provider_impl() reads exactly these four APIs from
            # `deps` (datasetio, datasets, inference, agents). The previous
            # list (safety, vector_io, vector_dbs, tool_runtime, tool_groups)
            # omitted three of them, which would make provider construction
            # fail with a KeyError.
            api_dependencies=[
                Api.datasetio,
                Api.datasets,
                Api.inference,
                Api.agents,
            ],
        ),
    ]
|
|
@ -215,6 +215,7 @@ exclude = [
|
||||||
"^llama_stack/providers/inline/agents/meta_reference/agent_instance\\.py$",
|
"^llama_stack/providers/inline/agents/meta_reference/agent_instance\\.py$",
|
||||||
"^llama_stack/providers/inline/agents/meta_reference/agents\\.py$",
|
"^llama_stack/providers/inline/agents/meta_reference/agents\\.py$",
|
||||||
"^llama_stack/providers/inline/agents/meta_reference/safety\\.py$",
|
"^llama_stack/providers/inline/agents/meta_reference/safety\\.py$",
|
||||||
|
"^llama_stack/providers/inline/evaluation/meta_reference/evaluation\\.py$",
|
||||||
"^llama_stack/providers/inline/datasetio/localfs/",
|
"^llama_stack/providers/inline/datasetio/localfs/",
|
||||||
"^llama_stack/providers/inline/eval/meta_reference/eval\\.py$",
|
"^llama_stack/providers/inline/eval/meta_reference/eval\\.py$",
|
||||||
"^llama_stack/providers/inline/inference/meta_reference/config\\.py$",
|
"^llama_stack/providers/inline/inference/meta_reference/config\\.py$",
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue