test: revamp eval related integration tests (#1433)

# What does this PR do?

- revamp and clean up datasets/scoring/eval integration tests
- closes https://github.com/meta-llama/llama-stack/issues/1396

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan

**dataset**

```
LLAMA_STACK_BASE_URL=http://localhost:8321 pytest -v tests/integration/datasetio/
```

<img width="842" alt="image" src="https://github.com/user-attachments/assets/88fc2b6a-b496-47bf-bc0c-8fea48ba36ff" />

**scoring**

```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/scoring --text-model meta-llama/Llama-3.1-8B-Instruct --judge-model meta-llama/Llama-3.1-8B-Instruct
```

<img width="851" alt="image" src="https://github.com/user-attachments/assets/50f46415-b44c-4c37-a6c3-076f2767adb3" />

**eval**

```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/eval --text-model meta-llama/Llama-3.1-8B-Instruct --judge-model meta-llama/Llama-3.1-8B-Instruct
```

<img width="841" alt="image" src="https://github.com/user-attachments/assets/8eb1c65c-3b39-4d66-8ff4-f471ca783e49" />

[//]: # (## Documentation)
2025-12-03 18:00:36 +00:00 · 2025-03-06 10:51:35 -08:00 · 2025-03-06 10:51:35 -08:00 · bcb13c492f
commit bcb13c492f
parent 82e94fe22f
7 changed files with 184 additions and 222 deletions
--- a/tests/integration/datasetio/test_dataset.csv
+++ b/tests/integration/datasetio/test_dataset.csv
@@ -1,6 +1,6 @@
 input_query,generated_answer,expected_answer,chat_completion_input
-What is the capital of France?,London,Paris,"[{'role': 'user', 'content': 'What is the capital of France?'}]"
-Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{'role': 'user', 'content': 'Who is the CEO of Meta?'}]"
-What is the largest planet in our solar system?,Jupiter,Jupiter,"[{'role': 'user', 'content': 'What is the largest planet in our solar system?'}]"
-What is the smallest country in the world?,China,Vatican City,"[{'role': 'user', 'content': 'What is the smallest country in the world?'}]"
-What is the currency of Japan?,Yen,Yen,"[{'role': 'user', 'content': 'What is the currency of Japan?'}]"
+What is the capital of France?,London,Paris,"[{""role"": ""user"", ""content"": ""What is the capital of France?""}]"
+Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{""role"": ""user"", ""content"": ""Who is the CEO of Meta?""}]"
+What is the largest planet in our solar system?,Jupiter,Jupiter,"[{""role"": ""user"", ""content"": ""What is the largest planet in our solar system?""}]"
+What is the smallest country in the world?,China,Vatican City,"[{""role"": ""user"", ""content"": ""What is the smallest country in the world?""}]"
+What is the currency of Japan?,Yen,Yen,"[{""role"": ""user"", ""content"": ""What is the currency of Japan?""}]"
--- a/tests/integration/datasetio/test_datasetio.py
+++ b/tests/integration/datasetio/test_datasetio.py
@@ -9,13 +9,9 @@ import mimetypes
 import os
 from pathlib import Path

-import pytest
-
 # How to run this test:
 #
-# pytest llama_stack/providers/tests/datasetio/test_datasetio.py
-#   -m "meta_reference"
-#   -v -s --tb=short --disable-warnings
+# LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/datasetio


 def data_url_from_file(file_path: str) -> str:
@@ -60,42 +56,29 @@ def register_dataset(llama_stack_client, for_generation=False, for_rag=False, da
            "generated_answer": {"type": "string"},
        }

+    dataset_providers = [x for x in llama_stack_client.providers.list() if x.api == "datasetio"]
+    dataset_provider_id = dataset_providers[0].provider_id
+
    llama_stack_client.datasets.register(
        dataset_id=dataset_id,
        dataset_schema=dataset_schema,
        url=dict(uri=test_url),
-        provider_id="localfs",
+        provider_id=dataset_provider_id,
    )


-def test_datasets_list(llama_stack_client):
-    # NOTE: this needs you to ensure that you are starting from a clean state
-    # but so far we don't have an unregister API unfortunately, so be careful
-
-    response = llama_stack_client.datasets.list()
-    assert isinstance(response, list)
-    assert len(response) == 0
-
-
-def test_register_dataset(llama_stack_client):
+def test_register_unregister_dataset(llama_stack_client):
    register_dataset(llama_stack_client)
    response = llama_stack_client.datasets.list()
    assert isinstance(response, list)
    assert len(response) == 1
    assert response[0].identifier == "test_dataset"

-    with pytest.raises(ValueError):
-        # unregister a dataset that does not exist
-        llama_stack_client.datasets.unregister("test_dataset2")
-
    llama_stack_client.datasets.unregister("test_dataset")
    response = llama_stack_client.datasets.list()
    assert isinstance(response, list)
    assert len(response) == 0

-    with pytest.raises(ValueError):
-        llama_stack_client.datasets.unregister("test_dataset")
-

 def test_get_rows_paginated(llama_stack_client):
    register_dataset(llama_stack_client)