Add Eleuther eval provider

Xi Yan 2024-10-04 13:45:52 -07:00
parent 2441e66d14
commit 6234dd97d5
7 changed files with 168 additions and 2 deletions

View file

@@ -37,7 +37,7 @@ class EvaluateTaskRequestCommon(BaseModel):
 class EvaluateResponse(BaseModel):
     """Scores for evaluation."""
 
-    metrics: Dict[str, float]
+    metrics: Dict[str, str]
 
 
 @json_schema_type
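The change above makes metric values strings rather than floats, so a provider can return a pre-rendered results table, which is what the Eleuther adapter added later in this commit does. A minimal sketch of a response under the new type, assuming EvaluateResponse is importable from llama_stack.apis.evals (as the adapter's star import suggests); the table text is made up for illustration:

# Illustrative only: mirrors how the Eleuther adapter below fills the response.
from llama_stack.apis.evals import EvaluateResponse

response = EvaluateResponse(
    metrics={
        # made-up table text; the real adapter puts lm-eval's formatted table here
        "metrics_table": "|  Tasks  |Metric|Value|\n|arc_easy |acc   | 0.50|",
    },
)
print(response.metrics["metrics_table"])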

View file

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import EleutherEvalsImplConfig  # noqa
from llama_stack.apis.inference import *  # noqa: F403
from llama_stack.distribution.datatypes import Api, ProviderSpec


async def get_provider_impl(
    config: EleutherEvalsImplConfig, deps: Dict[Api, ProviderSpec]
):
    from .eleuther import EleutherEvalsAdapter

    impl = EleutherEvalsAdapter(config, deps[Api.inference])
    await impl.initialize()
    return impl

View file

@@ -0,0 +1,10 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pydantic import BaseModel


class EleutherEvalsImplConfig(BaseModel): ...

View file

@@ -0,0 +1,120 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.inference import *  # noqa: F403
from llama_stack.apis.evals import *  # noqa: F403

import random

import lm_eval
from lm_eval.api.model import LM
from lm_eval.evaluator import evaluate, get_task_list
from lm_eval.tasks import get_task_dict, TaskManager

from .config import EleutherEvalsImplConfig  # noqa


class EleutherEvalsWrapper(LM):
    def __init__(
        self,
        inference_api: Inference,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.inference_api = inference_api
        self.tokenizer = None
        self.tokenized_requests = False
        self.kwargs = kwargs

    @property
    def eot_token_id(self):
        raise NotImplementedError("Not implemented")

    @property
    def max_length(self) -> int:
        raise NotImplementedError("Not implemented")

    @property
    def max_gen_toks(self) -> int:
        raise NotImplementedError("Not implemented")

    @property
    def batch_size(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError("No support for logits.")

    @property
    def device(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError("No support for logits.")

    @property
    def world_size(self):
        return 1

    def tok_encode(self, string: str) -> List[int]:
        raise NotImplementedError("Not implemented")

    def tok_decode(self, tokens: List[int]) -> str:
        raise NotImplementedError("Not implemented")

    def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")

    def _model_call(self, inps):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def _model_generate(self, context, max_length, eos_token_id):
        # Isn't used because we override generate_until
        raise NotImplementedError()

    def loglikelihood(self, requests, disable_tqdm: bool = False):
        # TODO: implement inference completion with loglikelihood;
        # placeholder scores keep loglikelihood-based tasks runnable for now
        res = []
        for req in requests:
            res.append((-random.random(), False))
        return res

    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")

    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
        raise NotImplementedError("Not implemented")


class EleutherEvalsAdapter(Evals):
    def __init__(self, config: EleutherEvalsImplConfig, inference_api: Inference):
        self.inference_api = inference_api
        self.eleuther_wrapper = EleutherEvalsWrapper(inference_api)

    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass

    async def run_evals(
        self,
        model: str,
        dataset: str,
        task: str,
    ) -> EvaluateResponse:
        task_manager = TaskManager()
        task_dict = get_task_dict(task, task_manager)
        task_types = set([t.task.OUTPUT_TYPE for t in get_task_list(task_dict)])

        output = evaluate(
            self.eleuther_wrapper,
            task_dict,
            limit=2,
        )

        formatted_output = lm_eval.utils.make_table(output)

        return EvaluateResponse(
            metrics={
                "metrics_table": formatted_output,
            },
        )
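For reviewers, a rough sketch of how the new adapter could be exercised end to end through get_provider_impl and run_evals. StubInference, the model id, and the arc_easy task name are placeholders for this illustration, not part of the commit; with the placeholder loglikelihood above, the reported scores are random.

# Sketch only: drives get_provider_impl() and run_evals() as defined above.
import asyncio

from llama_stack.distribution.datatypes import Api
from llama_stack.providers.adapters.evals.eleuther import (
    EleutherEvalsImplConfig,
    get_provider_impl,
)


class StubInference:  # placeholder Inference implementation (assumption)
    pass


async def main() -> None:
    impl = await get_provider_impl(
        EleutherEvalsImplConfig(), {Api.inference: StubInference()}
    )
    # run_evals() builds the lm-eval task dict and returns the formatted table
    # under the "metrics_table" key, as the adapter above shows.
    response = await impl.run_evals(
        model="Llama3.1-8B-Instruct",  # not consumed by this provider yet (assumption)
        dataset="",  # likewise unused in this first version
        task="arc_easy",  # any lm-eval task name (assumption)
    )
    print(response.metrics["metrics_table"])


asyncio.run(main())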

View file

@@ -27,4 +27,16 @@ def available_providers() -> List[ProviderSpec]:
             Api.inference,
         ],
     ),
+        InlineProviderSpec(
+            api=Api.evals,
+            provider_type="eleuther",
+            pip_packages=[
+                "lm-eval",
+            ],
+            module="llama_stack.providers.adapters.evals.eleuther",
+            config_class="llama_stack.providers.adapters.evals.eleuther.EleutherEvalsImplConfig",
+            api_dependencies=[
+                Api.inference,
+            ],
+        ),
     ]

View file

@@ -13,7 +13,7 @@ apis_to_serve:
 - evals
 api_providers:
   evals:
-    provider_type: meta-reference
+    provider_type: eleuther
     config: {}
   inference:
     providers: