diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py index a1f696dff..2a6947b32 100644 --- a/llama_stack/apis/evals/client.py +++ b/llama_stack/apis/evals/client.py @@ -42,12 +42,21 @@ class EvaluationClient(Evals): async def run_main(host: str, port: int): client = EvaluationClient(f"http://{host}:{port}") + # CustomDataset + # response = await client.run_evals( + # "Llama3.1-8B-Instruct", + # "mmlu-simple-eval-en", + # "mmlu", + # ) + # cprint(f"evaluate response={response}", "green") + + # Eleuther Eval response = await client.run_evals( "Llama3.1-8B-Instruct", - "mmlu-simple-eval-en", + "PLACEHOLDER_DATASET_NAME", "mmlu", ) - cprint(f"evaluate response={response}", "green") + cprint(response.metrics["metrics_table"], "red") def main(host: str, port: int): diff --git a/llama_stack/apis/inference/client.py b/llama_stack/apis/inference/client.py index 2aae1cc55..92acc3e14 100644 --- a/llama_stack/apis/inference/client.py +++ b/llama_stack/apis/inference/client.py @@ -109,7 +109,7 @@ async def run_main(host: str, port: int, stream: bool): cprint(f"User>{message.content}", "green") iterator = client.chat_completion( model="Llama3.1-8B-Instruct", - messages=[message, UserMessage(content="write me 3 sentence about the sun.")], + messages=[message], stream=stream, ) async for log in EventLogger().log(iterator): diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py index 4f2939db1..d74476628 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/mmlu_task.py @@ -146,5 +146,5 @@ class MMLUTask(BaseTask): def aggregate_results(self, eval_results): return EvaluateResponse( - metrics={"score": sum(eval_results) / len(eval_results)} + metrics={"score": str(sum(eval_results) / len(eval_results))} ) diff --git a/llama_stack/providers/adapters/evals/__init__.py b/llama_stack/providers/impls/third_party/evals/__init__.py similarity index 100% rename from llama_stack/providers/adapters/evals/__init__.py rename to llama_stack/providers/impls/third_party/evals/__init__.py diff --git a/llama_stack/providers/adapters/evals/eleuther/__init__.py b/llama_stack/providers/impls/third_party/evals/eleuther/__init__.py similarity index 100% rename from llama_stack/providers/adapters/evals/eleuther/__init__.py rename to llama_stack/providers/impls/third_party/evals/eleuther/__init__.py diff --git a/llama_stack/providers/adapters/evals/eleuther/config.py b/llama_stack/providers/impls/third_party/evals/eleuther/config.py similarity index 100% rename from llama_stack/providers/adapters/evals/eleuther/config.py rename to llama_stack/providers/impls/third_party/evals/eleuther/config.py diff --git a/llama_stack/providers/adapters/evals/eleuther/eleuther.py b/llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py similarity index 100% rename from llama_stack/providers/adapters/evals/eleuther/eleuther.py rename to llama_stack/providers/impls/third_party/evals/eleuther/eleuther.py diff --git a/llama_stack/providers/registry/evals.py b/llama_stack/providers/registry/evals.py index c1630aa07..8693ec603 100644 --- a/llama_stack/providers/registry/evals.py +++ b/llama_stack/providers/registry/evals.py @@ -33,8 +33,8 @@ def available_providers() -> List[ProviderSpec]: pip_packages=[ "lm-eval", ], - module="llama_stack.providers.adapters.evals.eleuther", - config_class="llama_stack.providers.adapters.evals.eleuther.EleutherEvalsImplConfig", + module="llama_stack.providers.impls.third_party.evals.eleuther", + config_class="llama_stack.providers.impls.third_party.evals.eleuther.EleutherEvalsImplConfig", api_dependencies=[ Api.inference, ], diff --git a/tests/examples/local-run.yaml b/tests/examples/local-run.yaml index fa082a58c..4a616bc88 100644 --- a/tests/examples/local-run.yaml +++ b/tests/examples/local-run.yaml @@ -15,6 +15,9 @@ api_providers: evals: provider_type: eleuther config: {} + # evals: + # provider_type: meta-reference + # config: {} inference: providers: - meta-reference