diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index 5a4fafd4e..bc9215993 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -37,7 +37,7 @@ class EvaluateTaskRequestCommon(BaseModel): class EvaluateResponse(BaseModel): """Scores for evaluation.""" - metrics: Dict[str, float] + metrics: Dict[str, str] @json_schema_type diff --git a/llama_stack/providers/adapters/evals/__init__.py b/llama_stack/providers/adapters/evals/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/adapters/evals/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/adapters/evals/eleuther/__init__.py b/llama_stack/providers/adapters/evals/eleuther/__init__.py new file mode 100644 index 000000000..9886ed6d6 --- /dev/null +++ b/llama_stack/providers/adapters/evals/eleuther/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
async def get_provider_impl(
    config: EleutherEvalsImplConfig, deps: Dict[Api, ProviderSpec]
):
    """Build and initialize the Eleuther evals provider implementation.

    Args:
        config: Provider configuration, handed to the adapter unchanged.
        deps: Resolved dependencies keyed by API; the ``Api.inference``
            entry is looked up and passed to the adapter.

    Returns:
        An ``EleutherEvalsAdapter`` whose ``initialize()`` has already been
        awaited.
    """
    # Imported inside the function so the adapter module (and whatever it
    # imports) is only loaded when this provider is actually instantiated.
    from .eleuther import EleutherEvalsAdapter

    inference_dep = deps[Api.inference]
    adapter = EleutherEvalsAdapter(config, inference_dep)
    await adapter.initialize()
    return adapter
class EleutherEvalsWrapper(LM):
    """Expose a llama-stack ``Inference`` API through lm-eval's ``LM`` interface.

    Only ``loglikelihood`` currently returns data, and that data is a random
    placeholder. Every other model hook raises ``NotImplementedError``.
    """

    def __init__(
        self,
        inference_api: Inference,
        **kwargs,
    ):
        """Store the inference API and forward ``kwargs`` to ``LM.__init__``.

        Args:
            inference_api: Inference implementation kept on the instance.
            **kwargs: Passed to the base class and also kept as ``self.kwargs``.
        """
        super().__init__(**kwargs)
        self.inference_api = inference_api
        self.tokenizer = None
        self.tokenized_requests = False
        self.kwargs = kwargs

    @property
    def eot_token_id(self):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented")

    @property
    def max_length(self) -> int:
        """Not supported; always raises ``NotImplementedError``."""
        # Raise rather than return the exception: returning an exception
        # instance would hand callers a truthy non-int object and hide the
        # missing implementation.
        raise NotImplementedError("Not implemented")

    @property
    def max_gen_toks(self) -> int:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented")

    @property
    def batch_size(self):
        """Not supported; always raises ``NotImplementedError``."""
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError("No support for logits.")

    @property
    def device(self):
        """Not supported; always raises ``NotImplementedError``."""
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError("No support for logits.")

    @property
    def world_size(self):
        """Return the fixed world size of 1."""
        return 1

    def tok_encode(self, string: str) -> List[int]:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented")

    def tok_decode(self, tokens: List[int]) -> str:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented")

    def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("No support for logits.")

    def _model_call(self, inps):
        """Not supported; always raises ``NotImplementedError``."""
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def _model_generate(self, context, max_length, eos_token_id):
        """Not supported; always raises ``NotImplementedError``."""
        # Isn't used because we override generate_until
        raise NotImplementedError()

    def loglikelihood(self, requests, disable_tqdm: bool = False):
        """Return one placeholder ``(logprob, is_greedy)`` pair per request.

        The log-probability is a random value in ``(-1.0, 0.0]`` and the
        second element is always ``False``; the requests themselves are not
        inspected and ``inference_api`` is not called.

        Args:
            requests: Iterable of requests; only its length matters here.
            disable_tqdm: Accepted for interface compatibility; unused.

        Returns:
            A list with one ``(float, bool)`` tuple per request.
        """
        # TODO: implement inference completion with loglikelihood
        return [(-random.random(), False) for _ in requests]

    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("No support for logits.")

    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented")


class EleutherEvalsAdapter(Evals):
    """``Evals`` provider that runs lm-eval tasks against ``EleutherEvalsWrapper``."""

    def __init__(self, config: EleutherEvalsImplConfig, inference_api: Inference):
        """Keep the inference API and wrap it for lm-eval.

        Args:
            config: Provider configuration; accepted but not currently used.
            inference_api: Inference implementation handed to the wrapper.
        """
        self.inference_api = inference_api
        # Attribute spelling ("eluther") is kept as-is so existing references
        # to it keep working.
        self.eluther_wrapper = EleutherEvalsWrapper(inference_api)

    async def initialize(self) -> None:
        """No-op; nothing needs to be set up."""
        pass

    async def shutdown(self) -> None:
        """No-op; nothing needs to be torn down."""
        pass

    async def run_evals(
        self,
        model: str,
        dataset: str,
        task: str,
    ) -> EvaluateResponse:
        """Run the named lm-eval task and return its results table.

        Args:
            model: Accepted for interface compatibility; not used here.
            dataset: Accepted for interface compatibility; not used here.
            task: Task name resolved through lm-eval's ``TaskManager``.

        Returns:
            An ``EvaluateResponse`` whose ``metrics`` dict holds a single
            ``"metrics_table"`` entry: the output of
            ``lm_eval.utils.make_table`` for the evaluation results.
        """
        task_manager = TaskManager()
        task_dict = get_task_dict(task, task_manager)

        # NOTE(review): ``evaluate`` is called synchronously inside this
        # coroutine. ``limit=2`` presumably caps the number of examples per
        # task -- confirm against the lm-eval documentation before relying
        # on the reported numbers.
        output = evaluate(
            self.eluther_wrapper,
            task_dict,
            limit=2,
        )
        formatted_output = lm_eval.utils.make_table(output)
        return EvaluateResponse(
            metrics={
                "metrics_table": formatted_output,
            },
        )
providers: