diff --git a/llama_stack/apis/evals/client.py b/llama_stack/apis/evals/client.py
index 07877c13e..1e76812c6 100644
--- a/llama_stack/apis/evals/client.py
+++ b/llama_stack/apis/evals/client.py
@@ -46,23 +46,13 @@ class EvaluationClient(Evals):
 
     async def run_evals(
         self,
-        model: str,
-        task: str,
-        dataset: Optional[str] = None,
-        eval_task_config: Optional[EvaluateTaskConfig] = None,
+        eval_task_config: EvaluateTaskConfig,
     ) -> EvaluateResponse:
         async with httpx.AsyncClient() as client:
             response = await client.post(
                 f"{self.base_url}/evals/run_eval_task",
                 json={
-                    "model": model,
-                    "task": task,
-                    "dataset": dataset,
-                    "eval_task_config": (
-                        json.loads(eval_task_config.json())
-                        if eval_task_config
-                        else None
-                    ),
+                    "eval_task_config": json.loads(eval_task_config.json()),
                 },
                 headers={"Content-Type": "application/json"},
                 timeout=3600,
@@ -94,85 +84,88 @@ async def run_main(host: str, port: int, eval_dataset_path: str = ""):
     dataset_client = DatasetsClient(f"http://{host}:{port}")
 
     # Full Eval Task
-
-    # # 1. register custom dataset
-    # response = await dataset_client.create_dataset(
-    #     dataset_def=CustomDatasetDef(
-    #         identifier="mmlu-simple-eval-en",
-    #         url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
-    #     ),
-    # )
-    # cprint(f"datasets/create: {response}", "cyan")
-
-    # # 2. run evals on the registered dataset
-    # response = await client.run_evals(
-    #     model="Llama3.1-8B-Instruct",
-    #     dataset="mmlu-simple-eval-en",
-    #     task="mmlu",
-    # )
-
-    # if response.formatted_report:
-    #     cprint(response.formatted_report, "green")
-    # else:
-    #     cprint(f"Response: {response}", "green")
-
-    # Scoring Task
-    # 1. register huggingface dataset
-    response = await dataset_client.create_dataset(
-        dataset_def=HuggingfaceDatasetDef(
-            identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
-            dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals",
-            dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
-            rename_columns_map={
-                "output_parsed_answer": "generated_answer",
-                "input_correct_responses": "expected_answer",
-            },
-            kwargs={"split": "latest"},
-        )
-    )
-    cprint(response, "cyan")
-
+    # 1. register custom dataset
     response = await dataset_client.create_dataset(
         dataset_def=CustomDatasetDef(
-            identifier="rag-evals",
-            url=data_url_from_file(eval_dataset_path),
-            rename_columns_map={
-                "query": "input_query",
-            },
-        )
-    )
-    cprint(response, "cyan")
-
-    # 2. run evals on the registered dataset
-    response = await client.run_scorer(
-        dataset_config=EvaluateDatasetConfig(
-            dataset_identifier="rag-evals",
-            # dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
-            row_limit=10,
+            identifier="mmlu-simple-eval-en",
+            url="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv",
         ),
-        eval_scoring_config=EvaluateScoringConfig(
+    )
+    cprint(f"datasets/create: {response}", "cyan")
+
+    # # 2. run evals on the registered dataset
+    eval_task_config = EvaluateTaskConfig(
+        dataset_config=EvaluateDatasetConfig(
+            dataset_identifier="mmlu-simple-eval-en",
+            row_limit=3,
+        ),
+        processor_config=EvaluateProcessorConfig(
+            processor_identifier="mmlu",
+        ),
+        generation_config=EvaluateModelGenerationConfig(
+            model="Llama3.1-8B-Instruct",
+        ),
+        scoring_config=EvaluateScoringConfig(
             scorer_config_list=[
                 EvaluateSingleScorerConfig(scorer_name="accuracy"),
-                EvaluateSingleScorerConfig(
-                    scorer_name="braintrust::answer-correctness"
-                ),
+                EvaluateSingleScorerConfig(scorer_name="random"),
             ]
         ),
     )
-
+    response = await client.run_evals(
+        eval_task_config=eval_task_config,
+    )
     for k, v in response.eval_result.metrics.items():
         cprint(f"{k}: {v}", "green")
 
 
-    # Eleuther Eval Task
-    # response = await client.run_evals(
-    #     model="Llama3.1-8B-Instruct",
-    #     # task="meta_mmlu_pro_instruct",
-    #     task="meta_ifeval",
-    #     eval_task_config=EvaluateTaskConfig(
-    #         n_samples=2,
+    # Scoring Task
+    # # 1. register huggingface dataset
+    # response = await dataset_client.create_dataset(
+    #     dataset_def=HuggingfaceDatasetDef(
+    #         identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+    #         dataset_path="meta-llama/Llama-3.1-8B-Instruct-evals",
+    #         dataset_name="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+    #         rename_columns_map={
+    #             "output_parsed_answer": "generated_answer",
+    #             "input_correct_responses": "expected_answer",
+    #         },
+    #         kwargs={"split": "latest"},
+    #     )
+    # )
+    # cprint(response, "cyan")
+
+    # # register custom dataset from file path
+    # response = await dataset_client.create_dataset(
+    #     dataset_def=CustomDatasetDef(
+    #         identifier="rag-evals",
+    #         url=data_url_from_file(eval_dataset_path),
+    #         rename_columns_map={
+    #             "query": "input_query",
+    #         },
+    #     )
+    # )
+    # cprint(response, "cyan")
+
+    # # 2. run evals on the registered dataset
+    # response = await client.run_scorer(
+    #     dataset_config=EvaluateDatasetConfig(
+    #         dataset_identifier="rag-evals",
+    #         # dataset_identifier="Llama-3.1-8B-Instruct-evals__mmlu_pro__details",
+    #         row_limit=10,
+    #     ),
+    #     eval_scoring_config=EvaluateScoringConfig(
+    #         scorer_config_list=[
+    #             EvaluateSingleScorerConfig(scorer_name="accuracy"),
+    #             EvaluateSingleScorerConfig(
+    #                 scorer_name="braintrust::answer-correctness"
+    #             ),
+    #         ]
    #     ),
    # )
+    # for k, v in response.eval_result.metrics.items():
+    #     cprint(f"{k}: {v}", "green")
+
 
 def main(host: str, port: int, eval_dataset_path: str = ""):
     asyncio.run(run_main(host, port, eval_dataset_path))
diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py
index a02394ee4..c484db734 100644
--- a/llama_stack/apis/evals/evals.py
+++ b/llama_stack/apis/evals/evals.py
@@ -228,10 +228,7 @@ class Evals(Protocol):
     @webmethod(route="/evals/run_eval_task")
     async def run_eval_task(
         self,
-        model: str,
-        task: str,
-        dataset: Optional[str] = None,
-        eval_task_config: Optional[EvaluateTaskConfig] = None,
+        eval_task_config: EvaluateTaskConfig,
     ) -> EvaluateResponse: ...
 
     @webmethod(route="/evals/run_scorer")
diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py
index 916e40e3a..a9e2c641f 100644
--- a/llama_stack/providers/impls/meta_reference/evals/evals.py
+++ b/llama_stack/providers/impls/meta_reference/evals/evals.py
@@ -28,39 +28,9 @@ class MetaReferenceEvalsImpl(Evals):
 
     async def run_eval_task(
         self,
-        model: str,
-        task: str,
-        dataset: Optional[str] = None,
-        eval_task_config: Optional[EvaluateTaskConfig] = None,
+        eval_task_config: EvaluateTaskConfig,
     ) -> EvaluateResponse:
-        cprint(
-            f"model={model}, dataset={dataset}, task={task}, eval_task_config={eval_task_config}",
-            "red",
-        )
-
-        if not dataset:
-            raise ValueError("dataset must be specified for mete-reference evals")
-
-        if not eval_task_config:
-            # construct eval task config from inputs
-            eval_task_config = EvaluateTaskConfig(
-                dataset_config=EvaluateDatasetConfig(
-                    dataset_identifier=dataset,
-                    row_limit=3,
-                ),
-                processor_config=EvaluateProcessorConfig(
-                    processor_identifier="mmlu",
-                ),
-                generation_config=EvaluateModelGenerationConfig(
-                    model=model,
-                ),
-                scoring_config=EvaluateScoringConfig(
-                    scorer_config_list=[
-                        EvaluateSingleScorerConfig(scorer_name="accuracy"),
-                        EvaluateSingleScorerConfig(scorer_name="random"),
-                    ]
-                ),
-            )
+        cprint(f"run_eval_task: on {eval_task_config}", "green")
 
         run_task = RunEvalTask()
         eval_result = await run_task.run(eval_task_config, self.inference_api)
@@ -75,7 +45,7 @@ class MetaReferenceEvalsImpl(Evals):
         dataset_config: EvaluateDatasetConfig,
         eval_scoring_config: EvaluateScoringConfig,
     ) -> EvaluateResponse:
-        cprint("run_scorer")
+        cprint(f"run_scorer: on {dataset_config} with {eval_scoring_config}", "green")
 
         run_task = RunScoringTask()
         eval_result = await run_task.run(dataset_config, eval_scoring_config)