diff --git a/tests/integration/eval/test_eval.py b/tests/integration/eval/test_eval.py
index d1c3de519..01581e829 100644
--- a/tests/integration/eval/test_eval.py
+++ b/tests/integration/eval/test_eval.py
@@ -45,7 +45,7 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
     list_benchmarks = llama_stack_client.benchmarks.list()
     assert any(x.identifier == benchmark_id for x in list_benchmarks)

-    response = llama_stack_client.eval.evaluate_rows(
+    response = llama_stack_client.alpha.eval.evaluate_rows(
         benchmark_id=benchmark_id,
         input_rows=rows.data,
         scoring_functions=scoring_functions,
@@ -80,7 +80,7 @@ def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
         scoring_functions=[scoring_fn_id],
     )

-    response = llama_stack_client.eval.run_eval(
+    response = llama_stack_client.alpha.eval.run_eval(
         benchmark_id=benchmark_id,
         benchmark_config={
             "eval_candidate": {
@@ -93,10 +93,10 @@ def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
         },
     )
     assert response.job_id == "0"
-    job_status = llama_stack_client.eval.jobs.status(job_id=response.job_id, benchmark_id=benchmark_id)
+    job_status = llama_stack_client.alpha.eval.jobs.status(job_id=response.job_id, benchmark_id=benchmark_id)
     assert job_status and job_status.status == "completed"

-    eval_response = llama_stack_client.eval.jobs.retrieve(job_id=response.job_id, benchmark_id=benchmark_id)
+    eval_response = llama_stack_client.alpha.eval.jobs.retrieve(job_id=response.job_id, benchmark_id=benchmark_id)
     assert eval_response is not None
     assert len(eval_response.generations) == 5
     assert scoring_fn_id in eval_response.scores