From fb565dfb066addeacad8b31386c81d0c24787b2c Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Fri, 11 Oct 2024 09:30:10 -0700
Subject: [PATCH] eleuther eval fix

---
 llama_stack/apis/evals/client.py                   | 32 ++++++++++---------
 .../third_party/evals/eleuther/eleuther.py         |  8 +++--
 tests/examples/local-run.yaml                      |  4 +--
 3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py
index ad4a47145..bde78adc9 100644
--- a/llama_stack/apis/evals/client.py
+++ b/llama_stack/apis/evals/client.py
@@ -54,27 +54,29 @@ class EvaluationClient(Evals):
 async def run_main(host: str, port: int):
     client = EvaluationClient(f"http://{host}:{port}")
 
-    # CustomDataset
+    # Custom Eval Task
+    # response = await client.run_evals(
+    #     model="Llama3.1-8B-Instruct",
+    #     dataset="mmlu-simple-eval-en",
+    #     task="mmlu",
+    #     eval_task_config=EvaluateTaskConfig(
+    #         n_samples=2,
+    #     ),
+    # )
+
+    # Eleuther Eval Task
     response = await client.run_evals(
         model="Llama3.1-8B-Instruct",
-        dataset="mmlu-simple-eval-en",
-        task="mmlu",
+        # task="meta_mmlu_pro_instruct",
+        task="meta_ifeval",
         eval_task_config=EvaluateTaskConfig(
             n_samples=2,
         ),
     )
-    cprint(f"evaluate response={response}", "green")
-
-    # Eleuther Eval Task
-    # response = await client.run_evals(
-    #     model="Llama3.1-8B-Instruct",
-    #     task="meta_mmlu_pro_instruct",
-    #     # task="meta_ifeval",
-    #     eval_task_config=EvaluateTaskConfig(
-    #         n_samples=2,
-    #     )
-    # )
-    # cprint(response.metrics["metrics_table"], "red")
+    if response.formatted_report:
+        cprint(response.formatted_report, "green")
+    else:
+        cprint(f"evaluate response={response}", "green")
 
 
 def main(host: str, port: int):
diff --git a/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py
index b9f9505e9..e4b32a45e 100644
--- a/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py
+++ b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py
@@ -157,12 +157,14 @@ class EleutherEvalsAdapter(Evals):
             limit=eval_task_config.n_samples,
         )
 
+        eval_result = EvalResult(
+            metrics={},
+        )
         formatted_output = lm_eval.utils.make_table(output)
 
         cprint(formatted_output, "green")
 
         return EvaluateResponse(
-            metrics={
-                "metrics_table": formatted_output,
-            },
+            eval_result=eval_result,
+            formatted_report=formatted_output,
         )
diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml
index 3c9f73e0b..430ce6102 100644
--- a/tests/examples/local-run.yaml
+++ b/tests/examples/local-run.yaml
@@ -14,8 +14,8 @@ apis:
 - evals
 providers:
   evals:
-  - provider_id: meta-reference
-    provider_type: meta-reference
+  - provider_id: eleuther
+    provider_type: eleuther
     config: {}
   inference:
   - provider_id: remote::tgi