diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py
index 5a4fafd4e..bc9215993 100644
--- a/llama_stack/apis/evals/evals.py
+++ b/llama_stack/apis/evals/evals.py
@@ -37,7 +37,7 @@ class EvaluateTaskRequestCommon(BaseModel):
 class EvaluateResponse(BaseModel):
     """Scores for evaluation."""
 
-    metrics: Dict[str, float]
+    metrics: Dict[str, str]  # NOTE(review): was float; str admits a rendered table — confirm float providers still validate
 
 
 @json_schema_type
diff --git a/llama_stack/providers/adapters/evals/__init__.py b/llama_stack/providers/adapters/evals/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/llama_stack/providers/adapters/evals/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/adapters/evals/eleuther/__init__.py b/llama_stack/providers/adapters/evals/eleuther/__init__.py
new file mode 100644
index 000000000..9886ed6d6
--- /dev/null
+++ b/llama_stack/providers/adapters/evals/eleuther/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import EleutherEvalsImplConfig  # noqa
+from llama_stack.apis.inference import *  # noqa: F403
+from llama_stack.distribution.datatypes import Api, ProviderSpec
+
+
+async def get_provider_impl(
+    config: EleutherEvalsImplConfig, deps: Dict[Api, ProviderSpec]
+):
+    """Build the Eleuther evals adapter on the inference dep and initialize it."""
+    from .eleuther import EleutherEvalsAdapter
+    adapter = EleutherEvalsAdapter(config, deps[Api.inference])
+    await adapter.initialize()
+    return adapter
diff --git a/llama_stack/providers/adapters/evals/eleuther/config.py b/llama_stack/providers/adapters/evals/eleuther/config.py
new file mode 100644
index 000000000..a9ab297b4
--- /dev/null
+++ b/llama_stack/providers/adapters/evals/eleuther/config.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pydantic import BaseModel
+
+
+class EleutherEvalsImplConfig(BaseModel): ...  # no provider-specific options are defined yet
diff --git a/llama_stack/providers/adapters/evals/eleuther/eleuther.py b/llama_stack/providers/adapters/evals/eleuther/eleuther.py
new file mode 100644
index 000000000..ee51adf35
--- /dev/null
+++ b/llama_stack/providers/adapters/evals/eleuther/eleuther.py
@@ -0,0 +1,120 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.inference import *  # noqa: F403
+from llama_stack.apis.evals import *  # noqa: F403
+import random
+
+import lm_eval
+from lm_eval.api.model import LM
+from lm_eval.evaluator import evaluate, get_task_list
+from lm_eval.tasks import get_task_dict, TaskManager
+
+from .config import EleutherEvalsImplConfig  # noqa
+
+
+class EleutherEvalsWrapper(LM):
+    """lm-eval `LM` shim over the Inference API; most hooks are still stubs."""
+
+    def __init__(
+        self,
+        inference_api: Inference,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.inference_api = inference_api
+        self.tokenizer = None
+        self.tokenized_requests = False
+        self.kwargs = kwargs
+
+    # Unimplemented hooks raise NotImplementedError rather than returning the exception.
+    @property
+    def eot_token_id(self):
+        raise NotImplementedError("Not implemented")
+
+    @property
+    def max_length(self) -> int:
+        raise NotImplementedError("Not implemented")
+
+    @property
+    def max_gen_toks(self) -> int:
+        raise NotImplementedError("Not implemented")
+
+    @property
+    def batch_size(self):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError("No support for logits.")
+
+    @property
+    def device(self):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError("No support for logits.")
+
+    @property
+    def world_size(self):
+        return 1
+
+    def tok_encode(self, string: str) -> List[int]:
+        raise NotImplementedError("Not implemented")
+
+    def tok_decode(self, tokens: List[int]) -> str:
+        raise NotImplementedError("Not implemented")
+
+    def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
+        raise NotImplementedError("No support for logits.")
+
+    def _model_call(self, inps):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError()
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        # Isn't used because we override generate_until
+        raise NotImplementedError()
+
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
+        # TODO: implement inference completion with loglikelihood.
+        # Placeholder: random (log-likelihood, is_greedy) pairs, so scores are meaningless.
+        return [(-random.random(), False) for _ in requests]
+
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
+        raise NotImplementedError("No support for logits.")
+
+    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
+        raise NotImplementedError("Not implemented")
+
+
+class EleutherEvalsAdapter(Evals):
+    """Evals provider that runs lm-eval harness tasks via `EleutherEvalsWrapper`."""
+
+    def __init__(self, config: EleutherEvalsImplConfig, inference_api: Inference):
+        self.inference_api = inference_api
+        self.eluther_wrapper = EleutherEvalsWrapper(inference_api)
+
+    async def initialize(self) -> None:
+        pass
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def run_evals(
+        self,
+        model: str,
+        dataset: str,
+        task: str,
+    ) -> EvaluateResponse:
+        """Run lm-eval `task`; return its results table as `metrics["metrics_table"]`."""
+        # NOTE: `model` and `dataset` are currently ignored; only `task` is used.
+        task_dict = get_task_dict(task, TaskManager())
+        # NOTE: evaluate() runs synchronously inside this coroutine (no await).
+        # limit=2 is hard-coded; per lm-eval it caps the examples evaluated per task.
+        output = evaluate(
+            self.eluther_wrapper,
+            task_dict,
+            limit=2,
+        )
+        return EvaluateResponse(
+            metrics={"metrics_table": lm_eval.utils.make_table(output)},
+        )
diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py
index 8c98cb199..c1630aa07 100644
--- a/llama_stack/providers/registry/evals.py
+++ b/llama_stack/providers/registry/evals.py
@@ -27,4 +27,16 @@ def available_providers() -> List[ProviderSpec]:
                 Api.inference,
             ],
         ),
+        InlineProviderSpec(
+            api=Api.evals,
+            provider_type="eleuther",
+            pip_packages=[
+                "lm-eval",
+            ],
+            module="llama_stack.providers.adapters.evals.eleuther",
+            config_class="llama_stack.providers.adapters.evals.eleuther.EleutherEvalsImplConfig",
+            api_dependencies=[
+                Api.inference,
+            ],
+        ),
     ]
diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml
index 4a3a98de2..fa082a58c 100644
--- a/tests/examples/local-run.yaml
+++ b/tests/examples/local-run.yaml
@@ -13,7 +13,7 @@ apis_to_serve:
 - evals
 api_providers:
   evals:
-    provider_type: meta-reference
+    provider_type: eleuther
     config: {}
   inference:
     providers: