From 1a07703d8b81e7188bbd3b6d6e1f3ba407726e42 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Sat, 15 Mar 2025 17:23:37 -0700
Subject: [PATCH] fix notebook

---
 .../ui/page/evaluations/native_eval.py |  4 +---
 tests/integration/eval/test_eval.py    | 16 ++++------------
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py
index fc7568022..7c39adc4a 100644
--- a/llama_stack/distribution/ui/page/evaluations/native_eval.py
+++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py
@@ -229,9 +229,7 @@ def run_evaluation_3():
                     output_res[scoring_fn] = []
                 output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
 
-            progress_text_container.write(
-                f"Expand to see current processed result ({i + 1} / {len(rows)})"
-            )
+            progress_text_container.write(f"Expand to see current processed result ({i + 1} / {len(rows)})")
             results_container.json(eval_res, expanded=2)
 
         progress_bar.progress(1.0, text="Evaluation complete!")
diff --git a/tests/integration/eval/test_eval.py b/tests/integration/eval/test_eval.py
index 6f2c9670e..e25daabbe 100644
--- a/tests/integration/eval/test_eval.py
+++ b/tests/integration/eval/test_eval.py
@@ -16,9 +16,7 @@ from ..datasetio.test_datasetio import register_dataset
 
 @pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
 def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
-    register_dataset(
-        llama_stack_client, for_generation=True, dataset_id="test_dataset_for_eval"
-    )
+    register_dataset(llama_stack_client, for_generation=True, dataset_id="test_dataset_for_eval")
     response = llama_stack_client.datasets.list()
     assert any(x.identifier == "test_dataset_for_eval" for x in response)
 
@@ -61,9 +59,7 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
 
 @pytest.mark.parametrize("scoring_fn_id", ["basic::subset_of"])
 def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
-    register_dataset(
-        llama_stack_client, for_generation=True, dataset_id="test_dataset_for_eval_2"
-    )
+    register_dataset(llama_stack_client, for_generation=True, dataset_id="test_dataset_for_eval_2")
     benchmark_id = str(uuid.uuid4())
     llama_stack_client.benchmarks.register(
         benchmark_id=benchmark_id,
@@ -84,14 +80,10 @@ def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
         },
     )
     assert response.job_id == "0"
-    job_status = llama_stack_client.eval.jobs.status(
-        job_id=response.job_id, benchmark_id=benchmark_id
-    )
+    job_status = llama_stack_client.eval.jobs.status(job_id=response.job_id, benchmark_id=benchmark_id)
     assert job_status and job_status == "completed"
 
-    eval_response = llama_stack_client.eval.jobs.retrieve(
-        job_id=response.job_id, benchmark_id=benchmark_id
-    )
+    eval_response = llama_stack_client.eval.jobs.retrieve(job_id=response.job_id, benchmark_id=benchmark_id)
     assert eval_response is not None
     assert len(eval_response.generations) == 5
     assert scoring_fn_id in eval_response.scores