work eval

2025-08-12 04:50:39 +00:00 · 2025-03-05 17:12:47 -08:00 · 2025-03-05 17:12:47 -08:00 · 6e65b9282d
commit 6e65b9282d
parent fd68b0dc9a
2 changed files with 50 additions and 9 deletions
--- a/tests/integration/datasetio/test_dataset.csv
+++ b/tests/integration/datasetio/test_dataset.csv
@@ -1,6 +1,6 @@
 input_query,generated_answer,expected_answer,chat_completion_input
-What is the capital of France?,London,Paris,"[{'role': 'user', 'content': 'What is the capital of France?'}]"
+What is the capital of France?,London,Paris,"[{""role"": ""user"", ""content"": ""What is the capital of France?""}]"
-Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{'role': 'user', 'content': 'Who is the CEO of Meta?'}]"
+Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{""role"": ""user"", ""content"": ""Who is the CEO of Meta?""}]"
-What is the largest planet in our solar system?,Jupiter,Jupiter,"[{'role': 'user', 'content': 'What is the largest planet in our solar system?'}]"
+What is the largest planet in our solar system?,Jupiter,Jupiter,"[{""role"": ""user"", ""content"": ""What is the largest planet in our solar system?""}]"
-What is the smallest country in the world?,China,Vatican City,"[{'role': 'user', 'content': 'What is the smallest country in the world?'}]"
+What is the smallest country in the world?,China,Vatican City,"[{""role"": ""user"", ""content"": ""What is the smallest country in the world?""}]"
-What is the currency of Japan?,Yen,Yen,"[{'role': 'user', 'content': 'What is the currency of Japan?'}]"
+What is the currency of Japan?,Yen,Yen,"[{""role"": ""user"", ""content"": ""What is the currency of Japan?""}]"
--- a/tests/integration/eval/test_eval.py
+++ b/tests/integration/eval/test_eval.py
@@ -3,17 +3,58 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import uuid
 import pytest
 from ..datasetio.test_datasetio import register_dataset
 # How to run this test:
 #
 # LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/eval
-def test_benchmarks_list(llama_stack_client):
+@pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
-    response = llama_stack_client.benchmarks.list()
+def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
-    assert isinstance(response, list)
+    register_dataset(llama_stack_client, for_generation=True, dataset_id="test_dataset_for_eval")
-    assert len(response) == 0
+    response = llama_stack_client.datasets.list()
+    assert any(x.identifier == "test_dataset_for_eval" for x in response)
+    rows = llama_stack_client.datasetio.get_rows_paginated(
+        dataset_id="test_dataset_for_eval",
+        rows_in_page=3,
+    )
+    assert len(rows.rows) == 3
+    scoring_functions = [
+        scoring_fn_id,
+    ]
+    benchmark_id = str(uuid.uuid4())
+    llama_stack_client.benchmarks.register(
+        benchmark_id=benchmark_id,
+        dataset_id="test_dataset_for_eval",
+        scoring_functions=scoring_functions,
+    )
+    list_benchmarks = llama_stack_client.benchmarks.list()
+    assert any(x.identifier == benchmark_id for x in list_benchmarks)
+    response = llama_stack_client.eval.evaluate_rows(
+        benchmark_id=benchmark_id,
+        input_rows=rows.rows,
+        scoring_functions=scoring_functions,
+        benchmark_config={
+            "eval_candidate": {
+                "type": "model",
+                "model": text_model_id,
+                "sampling_params": {
+                    "temperature": 0.0,
+                },
+            },
+        },
+    )
+    assert len(response.generations) == 3
+    assert scoring_fn_id in response.scores
 # @pytest.mark.skip(reason="FIXME FIXME @yanxi0830 this needs to be migrated to use the API")