refactor(test): move tools, evals, datasetio, scoring and post training tests (#1401)

All of the tests from `llama_stack/providers/tests/` are now moved to
`tests/integration`.

I converted the `tools`, `scoring` and `datasetio` tests to use API.
However, `eval` and `post_training` proved to be a bit challenging to
convert, so I am leaving those as they are for now. I think
`post_training` should be relatively straightforward to convert as well.

As part of this, I noticed that the `wolfram_alpha` tool wasn't added to
some of our commonly used distros, so I added it. I am going to remove a
lot of code duplication from the distros next, so while this looks like a
one-off right now, the special-casing will go away and the tool will be
available uniformly in all distros.
This commit is contained in:
Ashwin Bharambe 2025-03-04 14:53:47 -08:00 committed by GitHub
parent dd0db8038b
commit abfbaf3c1b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
51 changed files with 471 additions and 1245 deletions

View file

@ -20,7 +20,7 @@ from llama_stack.distribution.datatypes import Provider, StackRunConfig
from llama_stack.distribution.distribution import get_provider_registry
from llama_stack.distribution.stack import replace_env_vars
from llama_stack.distribution.utils.dynamic import instantiate_class_type
from llama_stack.providers.tests.env import get_env_or_fail
from llama_stack.env import get_env_or_fail
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
from .fixtures.recordable_mock import RecordableMock
@ -84,6 +84,11 @@ def pytest_addoption(parser):
default=None,
help="Specify the embedding model to use for testing",
)
parser.addoption(
"--judge-model",
default=None,
help="Specify the judge model to use for testing",
)
parser.addoption(
"--embedding-dimension",
type=int,
@ -109,6 +114,7 @@ def provider_data():
"TOGETHER_API_KEY": "together_api_key",
"ANTHROPIC_API_KEY": "anthropic_api_key",
"GROQ_API_KEY": "groq_api_key",
"WOLFRAM_ALPHA_API_KEY": "wolfram_alpha_api_key",
}
provider_data = {}
for key, value in keymap.items():
@ -260,7 +266,9 @@ def inference_provider_type(llama_stack_client):
@pytest.fixture(scope="session")
def client_with_models(llama_stack_client, text_model_id, vision_model_id, embedding_model_id, embedding_dimension):
def client_with_models(
llama_stack_client, text_model_id, vision_model_id, embedding_model_id, embedding_dimension, judge_model_id
):
client = llama_stack_client
providers = [p for p in client.providers.list() if p.api == "inference"]
@ -274,6 +282,8 @@ def client_with_models(llama_stack_client, text_model_id, vision_model_id, embed
client.models.register(model_id=text_model_id, provider_id=inference_providers[0])
if vision_model_id and vision_model_id not in model_ids:
client.models.register(model_id=vision_model_id, provider_id=inference_providers[0])
if judge_model_id and judge_model_id not in model_ids:
client.models.register(model_id=judge_model_id, provider_id=inference_providers[0])
if embedding_model_id and embedding_dimension and embedding_model_id not in model_ids:
# try to find a provider that supports embeddings, if sentence-transformers is not available
@ -328,6 +338,14 @@ def pytest_generate_tests(metafunc):
if val is not None:
id_parts.append(f"emb={get_short_id(val)}")
if "judge_model_id" in metafunc.fixturenames:
params.append("judge_model_id")
val = metafunc.config.getoption("--judge-model")
print(f"judge_model_id: {val}")
values.append(val)
if val is not None:
id_parts.append(f"judge={get_short_id(val)}")
if "embedding_dimension" in metafunc.fixturenames:
params.append("embedding_dimension")
val = metafunc.config.getoption("--embedding-dimension")