refactor(test): move tools, evals, datasetio, scoring and post training tests (#1401)

All of the tests from `llama_stack/providers/tests/` are now moved to `tests/integration`. I converted the `tools`, `scoring` and `datasetio` tests to use API. However, `eval` and `post_training` proved to be a bit challenging to leaving those. I think `post_training` should be relatively straightforward also. As part of this, I noticed that `wolfram_alpha` tool wasn't added to some of our commonly used distros so I added it. I am going to remove a lot of code duplication from distros next so while this looks like a one-off right now, it will go away and be there uniformly for all distros.
2025-12-03 09:53:45 +00:00 · 2025-03-04 14:53:47 -08:00 · 2025-03-04 14:53:47 -08:00 · abfbaf3c1b
commit abfbaf3c1b
parent dd0db8038b
51 changed files with 471 additions and 1245 deletions
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@ -20,7 +20,7 @@ from llama_stack.distribution.datatypes import Provider, StackRunConfig
 from llama_stack.distribution.distribution import get_provider_registry
 from llama_stack.distribution.stack import replace_env_vars
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
-from llama_stack.providers.tests.env import get_env_or_fail
+from llama_stack.env import get_env_or_fail
 from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig

 from .fixtures.recordable_mock import RecordableMock
@ -84,6 +84,11 @@ def pytest_addoption(parser):
        default=None,
        help="Specify the embedding model to use for testing",
    )
+    parser.addoption(
+        "--judge-model",
+        default=None,
+        help="Specify the judge model to use for testing",
+    )
    parser.addoption(
        "--embedding-dimension",
        type=int,
@ -109,6 +114,7 @@ def provider_data():
        "TOGETHER_API_KEY": "together_api_key",
        "ANTHROPIC_API_KEY": "anthropic_api_key",
        "GROQ_API_KEY": "groq_api_key",
+        "WOLFRAM_ALPHA_API_KEY": "wolfram_alpha_api_key",
    }
    provider_data = {}
    for key, value in keymap.items():
@ -260,7 +266,9 @@ def inference_provider_type(llama_stack_client):


@pytest.fixture(scope="session")
-def client_with_models(llama_stack_client, text_model_id, vision_model_id, embedding_model_id, embedding_dimension):
+def client_with_models(
+    llama_stack_client, text_model_id, vision_model_id, embedding_model_id, embedding_dimension, judge_model_id
+):
    client = llama_stack_client

    providers = [p for p in client.providers.list() if p.api == "inference"]
@ -274,6 +282,8 @@ def client_with_models(llama_stack_client, text_model_id, vision_model_id, embed
        client.models.register(model_id=text_model_id, provider_id=inference_providers[0])
    if vision_model_id and vision_model_id not in model_ids:
        client.models.register(model_id=vision_model_id, provider_id=inference_providers[0])
+    if judge_model_id and judge_model_id not in model_ids:
+        client.models.register(model_id=judge_model_id, provider_id=inference_providers[0])

    if embedding_model_id and embedding_dimension and embedding_model_id not in model_ids:
        # try to find a provider that supports embeddings, if sentence-transformers is not available
@ -328,6 +338,14 @@ def pytest_generate_tests(metafunc):
        if val is not None:
            id_parts.append(f"emb={get_short_id(val)}")

+    if "judge_model_id" in metafunc.fixturenames:
+        params.append("judge_model_id")
+        val = metafunc.config.getoption("--judge-model")
+        print(f"judge_model_id: {val}")
+        values.append(val)
+        if val is not None:
+            id_parts.append(f"judge={get_short_id(val)}")
+
    if "embedding_dimension" in metafunc.fixturenames:
        params.append("embedding_dimension")
        val = metafunc.config.getoption("--embedding-dimension")