work eval

2025-08-12 04:50:39 +00:00 · 2025-03-05 17:12:47 -08:00 · 2025-03-05 17:12:47 -08:00 · 6e65b9282d
commit 6e65b9282d
parent fd68b0dc9a
2 changed files with 50 additions and 9 deletions
--- a/tests/integration/datasetio/test_dataset.csv
+++ b/tests/integration/datasetio/test_dataset.csv
@ -1,6 +1,6 @@
 input_query,generated_answer,expected_answer,chat_completion_input
-What is the capital of France?,London,Paris,"[{'role': 'user', 'content': 'What is the capital of France?'}]"
-Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{'role': 'user', 'content': 'Who is the CEO of Meta?'}]"
-What is the largest planet in our solar system?,Jupiter,Jupiter,"[{'role': 'user', 'content': 'What is the largest planet in our solar system?'}]"
-What is the smallest country in the world?,China,Vatican City,"[{'role': 'user', 'content': 'What is the smallest country in the world?'}]"
-What is the currency of Japan?,Yen,Yen,"[{'role': 'user', 'content': 'What is the currency of Japan?'}]"
+What is the capital of France?,London,Paris,"[{""role"": ""user"", ""content"": ""What is the capital of France?""}]"
+Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{""role"": ""user"", ""content"": ""Who is the CEO of Meta?""}]"
+What is the largest planet in our solar system?,Jupiter,Jupiter,"[{""role"": ""user"", ""content"": ""What is the largest planet in our solar system?""}]"
+What is the smallest country in the world?,China,Vatican City,"[{""role"": ""user"", ""content"": ""What is the smallest country in the world?""}]"
+What is the currency of Japan?,Yen,Yen,"[{""role"": ""user"", ""content"": ""What is the currency of Japan?""}]"
--- a/tests/integration/eval/test_eval.py
+++ b/tests/integration/eval/test_eval.py
@ -3,17 +3,58 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import uuid

+import pytest
+
+from ..datasetio.test_datasetio import register_dataset

 # How to run this test:
 #
 # LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/eval


-def test_benchmarks_list(llama_stack_client):
-    response = llama_stack_client.benchmarks.list()
-    assert isinstance(response, list)
-    assert len(response) == 0
+@pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
+def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
+    register_dataset(llama_stack_client, for_generation=True, dataset_id="test_dataset_for_eval")
+    response = llama_stack_client.datasets.list()
+    assert any(x.identifier == "test_dataset_for_eval" for x in response)
+
+    rows = llama_stack_client.datasetio.get_rows_paginated(
+        dataset_id="test_dataset_for_eval",
+        rows_in_page=3,
+    )
+    assert len(rows.rows) == 3
+
+    scoring_functions = [
+        scoring_fn_id,
+    ]
+    benchmark_id = str(uuid.uuid4())
+    llama_stack_client.benchmarks.register(
+        benchmark_id=benchmark_id,
+        dataset_id="test_dataset_for_eval",
+        scoring_functions=scoring_functions,
+    )
+    list_benchmarks = llama_stack_client.benchmarks.list()
+    assert any(x.identifier == benchmark_id for x in list_benchmarks)
+
+    response = llama_stack_client.eval.evaluate_rows(
+        benchmark_id=benchmark_id,
+        input_rows=rows.rows,
+        scoring_functions=scoring_functions,
+        benchmark_config={
+            "eval_candidate": {
+                "type": "model",
+                "model": text_model_id,
+                "sampling_params": {
+                    "temperature": 0.0,
+                },
+            },
+        },
+    )
+
+    assert len(response.generations) == 3
+    assert scoring_fn_id in response.scores


 # @pytest.mark.skip(reason="FIXME FIXME @yanxi0830 this needs to be migrated to use the API")