From c79b08755220b0bad2f6c262187a09541318c2a8 Mon Sep 17 00:00:00 2001
From: Sixian Yi
Date: Thu, 16 Jan 2025 12:05:49 -0800
Subject: [PATCH] [test automation] support running tests from a config file (#730)

# Context
For test automation, the end goal is to run a single pytest command from the root test directory (llama_stack/providers/tests/.) such that we execute the push-blocking tests.

The work plan:
1) trigger pytest from llama_stack/providers/tests/.
2) use a config file to determine which tests and parametrizations we want to run

# What does this PR do?
1) Consolidates the `--inference-model` / `--embedding-model` / `--judge-model` ... options in the root conftest.py. Without this change, we hit an error when trying to run `pytest /Users/sxyi/llama-stack/llama_stack/providers/tests/.` because of duplicated `addoption` definitions across child conftest files.

2) Adds a `--config` option for specifying a test config in YAML. (See [`ci_test_config.yaml`](https://gist.github.com/sixianyi0721/5b37fbce4069139445c2f06f6e42f87e) for an example config file.)

For provider_fixtures, users can either pick a default fixture combination or define their own {api: provider} combinations:

```
memory:
  ....
  fixtures:
    provider_fixtures:
      - default_fixture_param_id: ollama # use the default fixture combination with param_id="ollama" in [providers/tests/memory/conftest.py](https://fburl.com/mtjzwsmk)
      - inference: sentence_transformers
        memory: faiss
      - default_fixture_param_id: chroma
```

3) Generates tests according to the config. The logic lives in two places:
a) in `{api}/conftest.py::pytest_generate_tests`, we read the config to do the parametrization.
b) after test collection, in `pytest_collection_modifyitems`, we filter the tests down to the functions listed in the config.

## Test Plan
1) `pytest /Users/sxyi/llama-stack/llama_stack/providers/tests/. --collect-only --config=ci_test_config.yaml`

Using the `--collect-only` flag to print the tests selected by the config file (`ci_test_config.yaml`). Output: [gist](https://gist.github.com/sixianyi0721/05145e60d4d085c17cfb304beeb1e60e)

2) Sanity check on the `--inference-model` option:
```
pytest -v -s -k "ollama" --inference-model="meta-llama/Llama-3.1-8B-Instruct" ./llama_stack/providers/tests/inference/test_text_inference.py
```

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [x] Ran pre-commit to handle lint / formatting issues.
- [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
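Note: the config format that actually ships in this PR nests scenarios under a per-API `scenarios` key, where each entry is either a `fixture_combo_id` referencing a default fixture combination by its param id, or an explicit `provider_fixtures` mapping. A minimal sketch in that shipped format (field names and values mirror the committed `ci_test_config.yaml` in the diff below):

```
memory:
  tests:
   - memory/test_memory.py::test_query_documents

  scenarios:
   # reuse a default fixture combination by its param id
   - fixture_combo_id: ollama
   # or spell out an {api: provider} mapping
   - provider_fixtures:
      inference: sentence_transformers
      memory: faiss

  embedding_model: all-MiniLM-L6-v2
```

During collection, `pytest_collection_modifyitems` then deselects every test function not listed under `tests`.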
---
 .../distribution/routers/routing_tables.py    |   2 +-
 llama_stack/providers/tests/README.md         |  14 ++
 .../providers/tests/agents/conftest.py        |  39 +++--
 .../tests/agents/test_persistence.py          |   4 +-
 .../providers/tests/ci_test_config.yaml       |  55 +++++++
 llama_stack/providers/tests/conftest.py       | 144 +++++++++++++++++-
 llama_stack/providers/tests/eval/conftest.py  |  16 --
 .../providers/tests/inference/conftest.py     |  48 +++---
 .../providers/tests/inference/fixtures.py     |   1 +
 .../providers/tests/memory/conftest.py        |  25 +--
 .../tests/post_training/test_post_training.py |   2 +-
 .../providers/tests/safety/conftest.py        |   9 --
 .../providers/tests/scoring/conftest.py       |  15 -
 llama_stack/providers/tests/tools/conftest.py |  15 -
 14 files changed, 273 insertions(+), 116 deletions(-)
 create mode 100644 llama_stack/providers/tests/ci_test_config.yaml

diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
index e02606936..889bd4624 100644
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@@ -246,7 +246,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
             provider_id = list(self.impls_by_provider_id.keys())[0]
         else:
             raise ValueError(
-                "No provider specified and multiple providers available. Please specify a provider_id. Available providers: {self.impls_by_provider_id.keys()}"
+                f"No provider specified and multiple providers available. Please specify a provider_id. Available providers: {self.impls_by_provider_id.keys()}"
             )
         if metadata is None:
             metadata = {}
diff --git a/llama_stack/providers/tests/README.md b/llama_stack/providers/tests/README.md
index 4b406b321..e4e94a3fd 100644
--- a/llama_stack/providers/tests/README.md
+++ b/llama_stack/providers/tests/README.md
@@ -10,6 +10,8 @@ We use `pytest` and all of its dynamism to enable the features needed. Specifica
 
 - We use `pytest_configure` to make sure we dynamically add appropriate marks based on the fixtures we make.
 
+- We use `pytest_collection_modifyitems` to filter tests based on the test config (if specified).
+
 ## Common options
 
 All tests support a `--providers` option which can be a string of the form `api1=provider_fixture1,api2=provider_fixture2`. So, when testing safety (which need inference and safety APIs) you can use `--providers inference=together,safety=meta_reference` to use these fixtures in concert.
@@ -73,3 +75,15 @@ If you wanted to test a remotely hosted stack, you can use `-m remote` as follows:
 pytest -s -m remote llama_stack/providers/tests/agents/test_agents.py \
   --env REMOTE_STACK_URL=<...>
 ```
+
+## Test Config
+If you want to run a test suite with a custom set of tests and parametrizations, you can define a YAML test config under the llama_stack/providers/tests/ folder and pass the filename through the `--config` option as follows:
+
+```
+pytest llama_stack/providers/tests/ --config=ci_test_config.yaml
+```
+
+### Test config format
+Currently, test configs are supported for the inference, agents, and memory API tests.
+
+An example test config can be found in ci_test_config.yaml.
diff --git a/llama_stack/providers/tests/agents/conftest.py b/llama_stack/providers/tests/agents/conftest.py
index ecd05dcf8..4efdfe8b7 100644
--- a/llama_stack/providers/tests/agents/conftest.py
+++ b/llama_stack/providers/tests/agents/conftest.py
@@ -6,10 +6,15 @@
 
 import pytest
 
-from ..conftest import get_provider_fixture_overrides
+from ..conftest import (
+    get_provider_fixture_overrides,
+    get_provider_fixture_overrides_from_test_config,
+    get_test_config_for_api,
+)
 
 from ..inference.fixtures import INFERENCE_FIXTURES
 from ..memory.fixtures import MEMORY_FIXTURES
 from ..safety.fixtures import SAFETY_FIXTURES, safety_model_from_shield
+
 from ..tools.fixtures import TOOL_RUNTIME_FIXTURES
 from .fixtures import AGENTS_FIXTURES
 
@@ -81,23 +86,15 @@ def pytest_configure(config):
         )
 
 
-def pytest_addoption(parser):
-    parser.addoption(
-        "--inference-model",
-        action="store",
-        default="meta-llama/Llama-3.2-3B-Instruct",
-        help="Specify the inference model to use for testing",
-    )
-    parser.addoption(
-        "--safety-shield",
-        action="store",
-        default="meta-llama/Llama-Guard-3-1B",
-        help="Specify the safety shield to use for testing",
-    )
-
-
 def pytest_generate_tests(metafunc):
-    shield_id = metafunc.config.getoption("--safety-shield")
+    test_config = get_test_config_for_api(metafunc.config, "agents")
+    shield_id = getattr(
+        test_config, "safety_shield", None
+    ) or metafunc.config.getoption("--safety-shield")
+    inference_models = getattr(test_config, "inference_models", None) or [
+        metafunc.config.getoption("--inference-model")
+    ]
+
     if "safety_shield" in metafunc.fixturenames:
         metafunc.parametrize(
             "safety_shield",
@@ -105,8 +102,7 @@ def pytest_generate_tests(metafunc):
             indirect=True,
         )
     if "inference_model" in metafunc.fixturenames:
-        inference_model = metafunc.config.getoption("--inference-model")
-        models = set({inference_model})
+        models = set(inference_models)
         if safety_model := safety_model_from_shield(shield_id):
             models.add(safety_model)
 
@@ -124,7 +120,10 @@ def pytest_generate_tests(metafunc):
         "tool_runtime": TOOL_RUNTIME_FIXTURES,
     }
     combinations = (
-        get_provider_fixture_overrides(metafunc.config, available_fixtures)
+        get_provider_fixture_overrides_from_test_config(
+            metafunc.config, "agents", DEFAULT_PROVIDER_COMBINATIONS
+        )
+        or get_provider_fixture_overrides(metafunc.config, available_fixtures)
         or DEFAULT_PROVIDER_COMBINATIONS
     )
     metafunc.parametrize("agents_stack", combinations, indirect=True)
diff --git a/llama_stack/providers/tests/agents/test_persistence.py b/llama_stack/providers/tests/agents/test_persistence.py
index 38eb7de55..e6b1470ef 100644
--- a/llama_stack/providers/tests/agents/test_persistence.py
+++ b/llama_stack/providers/tests/agents/test_persistence.py
@@ -9,7 +9,9 @@ import pytest
 from llama_stack.apis.agents import AgentConfig, Turn
 from llama_stack.apis.inference import SamplingParams, UserMessage
 from llama_stack.providers.datatypes import Api
-from llama_stack.providers.utils.kvstore import kvstore_impl, SqliteKVStoreConfig
+from llama_stack.providers.utils.kvstore import kvstore_impl
+from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
+
 from .fixtures import pick_inference_model
 from .utils import create_agent_session
 
diff --git a/llama_stack/providers/tests/ci_test_config.yaml b/llama_stack/providers/tests/ci_test_config.yaml
new file mode 100644
index 000000000..3edcd38bf
--- /dev/null
+++ b/llama_stack/providers/tests/ci_test_config.yaml
@@ -0,0 +1,55 @@
+inference:
+  tests:
+   - inference/test_vision_inference.py::test_vision_chat_completion_streaming
+   - inference/test_vision_inference.py::test_vision_chat_completion_non_streaming
+   - inference/test_text_inference.py::test_structured_output
+   - inference/test_text_inference.py::test_chat_completion_streaming
+   - inference/test_text_inference.py::test_chat_completion_non_streaming
+   - inference/test_text_inference.py::test_chat_completion_with_tool_calling
+   - inference/test_text_inference.py::test_chat_completion_with_tool_calling_streaming
+
+  scenarios:
+   - provider_fixtures:
+      inference: ollama
+   - fixture_combo_id: fireworks
+   - provider_fixtures:
+      inference: together
+  #   - inference: tgi
+  #   - inference: vllm_remote
+
+  inference_models:
+   - meta-llama/Llama-3.1-8B-Instruct
+   - meta-llama/Llama-3.2-11B-Vision-Instruct
+
+
+agents:
+  tests:
+   - agents/test_agents.py::test_agent_turns_with_safety
+   - agents/test_agents.py::test_rag_agent
+
+  scenarios:
+   - fixture_combo_id: ollama
+   - fixture_combo_id: together
+   - fixture_combo_id: fireworks
+
+  inference_models:
+   - meta-llama/Llama-3.2-1B-Instruct
+
+  safety_shield: meta-llama/Llama-Guard-3-1B
+
+
+memory:
+  tests:
+   - memory/test_memory.py::test_query_documents
+
+  scenarios:
+   - fixture_combo_id: ollama
+   - provider_fixtures:
+      inference: sentence_transformers
+      memory: faiss
+   - fixture_combo_id: chroma
+
+  inference_models:
+   - meta-llama/Llama-3.2-1B-Instruct
+
+  embedding_model: all-MiniLM-L6-v2
diff --git a/llama_stack/providers/tests/conftest.py b/llama_stack/providers/tests/conftest.py
index 7408a6375..9530695e1 100644
--- a/llama_stack/providers/tests/conftest.py
+++ b/llama_stack/providers/tests/conftest.py
@@ -5,12 +5,16 @@
 # the root directory of this source tree.
 
 import os
+from collections import defaultdict
+
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
 import pytest
+import yaml
+
 from dotenv import load_dotenv
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from termcolor import colored
 
 from llama_stack.distribution.datatypes import Provider
@@ -24,6 +28,83 @@ class ProviderFixture(BaseModel):
     provider_data: Optional[Dict[str, Any]] = None
 
 
+class TestScenario(BaseModel):
+    # provider fixtures can be either a mark or a dictionary of api -> providers
+    provider_fixtures: Dict[str, str] = Field(default_factory=dict)
+    fixture_combo_id: Optional[str] = None
+
+
+class APITestConfig(BaseModel):
+    scenarios: List[TestScenario] = Field(default_factory=list)
+    inference_models: List[str] = Field(default_factory=list)
+
+    # test name format should be <test_path>::<test_function_name>
+    tests: List[str] = Field(default_factory=list)
+
+
+class MemoryApiTestConfig(APITestConfig):
+    embedding_model: Optional[str] = Field(default=None)
+
+
+class AgentsApiTestConfig(APITestConfig):
+    safety_shield: Optional[str] = Field(default=None)
+
+
+class TestConfig(BaseModel):
+    inference: Optional[APITestConfig] = None
+    agents: Optional[AgentsApiTestConfig] = None
+    memory: Optional[MemoryApiTestConfig] = None
+
+
+def get_test_config_from_config_file(metafunc_config):
+    config_file = metafunc_config.getoption("--config")
+    if config_file is None:
+        return None
+
+    config_file_path = Path(__file__).parent / config_file
+    if not config_file_path.exists():
+        raise ValueError(
+            f"Test config {config_file} was specified but not found. Please make sure it exists in the llama_stack/providers/tests directory."
+        )
+    with open(config_file_path, "r") as config_file:
+        config = yaml.safe_load(config_file)
+        return TestConfig(**config)
+
+
+def get_test_config_for_api(metafunc_config, api):
+    test_config = get_test_config_from_config_file(metafunc_config)
+    if test_config is None:
+        return None
+    return getattr(test_config, api)
+
+
+def get_provider_fixture_overrides_from_test_config(
+    metafunc_config, api, default_provider_fixture_combinations
+):
+    api_config = get_test_config_for_api(metafunc_config, api)
+    if api_config is None:
+        return None
+
+    fixture_combo_ids = set()
+    custom_provider_fixture_combos = []
+    for scenario in api_config.scenarios:
+        if scenario.fixture_combo_id:
+            fixture_combo_ids.add(scenario.fixture_combo_id)
+        else:
+            custom_provider_fixture_combos.append(
+                pytest.param(
+                    scenario.provider_fixtures,
+                    id=scenario.provider_fixtures.get("inference") or "",
+                )
+            )
+
+    if len(fixture_combo_ids) > 0:
+        for default_fixture in default_provider_fixture_combinations:
+            if default_fixture.id in fixture_combo_ids:
+                custom_provider_fixture_combos.append(default_fixture)
+    return custom_provider_fixture_combos
+
+
 def remote_stack_fixture() -> ProviderFixture:
     if url := os.getenv("REMOTE_STACK_URL", None):
         config = RemoteProviderConfig.from_url(url)
@@ -69,10 +150,39 @@ def pytest_addoption(parser):
             "Example: --providers inference=ollama,safety=meta-reference"
         ),
    )
+    parser.addoption(
+        "--config",
+        action="store",
+        help="Set test config file (supported format: YAML), e.g. --config=test_config.yml",
+    )
     """Add custom command line options"""
     parser.addoption(
         "--env", action="append", help="Set environment variables, e.g. --env KEY=value"
     )
+    parser.addoption(
+        "--inference-model",
+        action="store",
+        default="meta-llama/Llama-3.2-3B-Instruct",
+        help="Specify the inference model to use for testing",
+    )
+    parser.addoption(
+        "--safety-shield",
+        action="store",
+        default="meta-llama/Llama-Guard-3-1B",
+        help="Specify the safety shield to use for testing",
+    )
+    parser.addoption(
+        "--embedding-model",
+        action="store",
+        default=None,
+        help="Specify the embedding model to use for testing",
+    )
+    parser.addoption(
+        "--judge-model",
+        action="store",
+        default="meta-llama/Llama-3.1-8B-Instruct",
+        help="Specify the judge model to use for testing",
+    )
 
 
 def make_provider_id(providers: Dict[str, str]) -> str:
@@ -148,6 +258,38 @@ def pytest_itemcollected(item):
         item.name = f"{item.name}[{marks}]"
 
 
+def pytest_collection_modifyitems(session, config, items):
+    test_config = get_test_config_from_config_file(config)
+    if test_config is None:
+        return
+
+    required_tests = defaultdict(set)
+    for api_test_config in [
+        test_config.inference,
+        test_config.memory,
+        test_config.agents,
+    ]:
+        if api_test_config is None:
+            continue
+        for test in api_test_config.tests:
+            arr = test.split("::")
+            if len(arr) != 2:
+                raise ValueError(f"Invalid format for test name {test}")
+            test_path, func_name = arr
+            required_tests[Path(__file__).parent / test_path].add(func_name)
+
+    new_items, deselected_items = [], []
+    for item in items:
+        func_name = getattr(item, "originalname", item.name)
+        if func_name in required_tests[item.fspath]:
+            new_items.append(item)
+            continue
+        deselected_items.append(item)
+
+    items[:] = new_items
+    config.hook.pytest_deselected(items=deselected_items)
+
+
 pytest_plugins = [
     "llama_stack.providers.tests.inference.fixtures",
     "llama_stack.providers.tests.safety.fixtures",
diff --git a/llama_stack/providers/tests/eval/conftest.py b/llama_stack/providers/tests/eval/conftest.py
index 3d6ef01b2..b7a68965e 100644
--- a/llama_stack/providers/tests/eval/conftest.py
+++ b/llama_stack/providers/tests/eval/conftest.py
@@ -76,22 +76,6 @@ def pytest_configure(config):
         )
 
 
-def pytest_addoption(parser):
-    parser.addoption(
-        "--inference-model",
-        action="store",
-        default="meta-llama/Llama-3.2-3B-Instruct",
-        help="Specify the inference model to use for testing",
-    )
-
-    parser.addoption(
-        "--judge-model",
-        action="store",
-        default="meta-llama/Llama-3.1-8B-Instruct",
-        help="Specify the judge model to use for testing",
-    )
-
-
 def pytest_generate_tests(metafunc):
     if "eval_stack" in metafunc.fixturenames:
         available_fixtures = {
diff --git a/llama_stack/providers/tests/inference/conftest.py b/llama_stack/providers/tests/inference/conftest.py
index 54ebcd83a..1303a1b35 100644
--- a/llama_stack/providers/tests/inference/conftest.py
+++ b/llama_stack/providers/tests/inference/conftest.py
@@ -6,26 +6,10 @@
 
 import pytest
 
-from ..conftest import get_provider_fixture_overrides
-
+from ..conftest import get_provider_fixture_overrides, get_test_config_for_api
 from .fixtures import INFERENCE_FIXTURES
 
 
-def pytest_addoption(parser):
-    parser.addoption(
-        "--inference-model",
-        action="store",
-        default=None,
-        help="Specify the inference model to use for testing",
-    )
-    parser.addoption(
-        "--embedding-model",
-        action="store",
-        default=None,
-        help="Specify the embedding model to use for testing",
-    )
-
-
 def pytest_configure(config):
     for model in ["llama_8b", "llama_3b", "llama_vision"]:
         config.addinivalue_line(
@@ -58,16 +42,21 @@ VISION_MODEL_PARAMS = [
 
 
 def pytest_generate_tests(metafunc):
+    test_config = get_test_config_for_api(metafunc.config, "inference")
+
     if "inference_model" in metafunc.fixturenames:
-        model = metafunc.config.getoption("--inference-model")
-        if model:
+        cls_name = metafunc.cls.__name__
+        params = []
+        inference_models = getattr(test_config, "inference_models", [])
+        for model in inference_models:
+            if ("Vision" in cls_name and "Vision" in model) or (
+                "Vision" not in cls_name and "Vision" not in model
+            ):
+                params.append(pytest.param(model, id=model))
+
+        if not params:
+            model = metafunc.config.getoption("--inference-model")
             params = [pytest.param(model, id="")]
-        else:
-            cls_name = metafunc.cls.__name__
-            if "Vision" in cls_name:
-                params = VISION_MODEL_PARAMS
-            else:
-                params = MODEL_PARAMS
 
         metafunc.parametrize(
             "inference_model",
@@ -83,4 +72,13 @@ def pytest_generate_tests(metafunc):
             },
         ):
             fixtures = [stack.values[0]["inference"] for stack in filtered_stacks]
+        if test_config:
+            if custom_fixtures := [
+                (
+                    scenario.fixture_combo_id
+                    or scenario.provider_fixtures.get("inference")
+                )
+                for scenario in test_config.scenarios
+            ]:
+                fixtures = custom_fixtures
         metafunc.parametrize("inference_stack", fixtures, indirect=True)
diff --git a/llama_stack/providers/tests/inference/fixtures.py b/llama_stack/providers/tests/inference/fixtures.py
index b6653b65d..0767e940f 100644
--- a/llama_stack/providers/tests/inference/fixtures.py
+++ b/llama_stack/providers/tests/inference/fixtures.py
@@ -301,6 +301,7 @@ async def inference_stack(request, inference_model):
             inference_fixture.provider_data,
             models=[
                 ModelInput(
+                    provider_id=inference_fixture.providers[0].provider_id,
                     model_id=inference_model,
                     model_type=model_type,
                     metadata=metadata,
diff --git a/llama_stack/providers/tests/memory/conftest.py b/llama_stack/providers/tests/memory/conftest.py
index 9b6ba177d..87dec4beb 100644
--- a/llama_stack/providers/tests/memory/conftest.py
+++ b/llama_stack/providers/tests/memory/conftest.py
@@ -6,7 +6,11 @@
 
 import pytest
 
-from ..conftest import get_provider_fixture_overrides
+from ..conftest import (
+    get_provider_fixture_overrides,
+    get_provider_fixture_overrides_from_test_config,
+    get_test_config_for_api,
+)
 
 from ..inference.fixtures import INFERENCE_FIXTURES
 from .fixtures import MEMORY_FIXTURES
@@ -56,15 +60,6 @@ DEFAULT_PROVIDER_COMBINATIONS = [
 ]
 
 
-def pytest_addoption(parser):
-    parser.addoption(
-        "--embedding-model",
-        action="store",
-        default=None,
-        help="Specify the embedding model to use for testing",
-    )
-
-
 def pytest_configure(config):
     for fixture_name in MEMORY_FIXTURES:
         config.addinivalue_line(
@@ -74,8 +69,11 @@ def pytest_configure(config):
 
 
 def pytest_generate_tests(metafunc):
+    test_config = get_test_config_for_api(metafunc.config, "memory")
     if "embedding_model" in metafunc.fixturenames:
-        model = metafunc.config.getoption("--embedding-model")
+        model = getattr(test_config, "embedding_model", None)
+        # Fall back to the default if not specified by the config file
+        model = model or metafunc.config.getoption("--embedding-model")
         if model:
             params = [pytest.param(model, id="")]
         else:
@@ -89,7 +87,10 @@ def pytest_generate_tests(metafunc):
         "memory": MEMORY_FIXTURES,
     }
     combinations = (
-        get_provider_fixture_overrides(metafunc.config, available_fixtures)
+        get_provider_fixture_overrides_from_test_config(
+            metafunc.config, "memory", DEFAULT_PROVIDER_COMBINATIONS
+        )
+        or get_provider_fixture_overrides(metafunc.config, available_fixtures)
         or DEFAULT_PROVIDER_COMBINATIONS
     )
     metafunc.parametrize("memory_stack", combinations, indirect=True)
diff --git a/llama_stack/providers/tests/post_training/test_post_training.py b/llama_stack/providers/tests/post_training/test_post_training.py
index 0645cd555..0c58c1fa0 100644
--- a/llama_stack/providers/tests/post_training/test_post_training.py
+++ b/llama_stack/providers/tests/post_training/test_post_training.py
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 
 import pytest
-from llama_stack.apis.common.type_system import JobStatus
+from llama_stack.apis.common.job_types import JobStatus
 from llama_stack.apis.post_training import (
     Checkpoint,
     DataConfig,
diff --git a/llama_stack/providers/tests/safety/conftest.py b/llama_stack/providers/tests/safety/conftest.py
index 6846517e3..a5e77f570 100644
--- a/llama_stack/providers/tests/safety/conftest.py
+++ b/llama_stack/providers/tests/safety/conftest.py
@@ -64,15 +64,6 @@ def pytest_configure(config):
         )
 
 
-def pytest_addoption(parser):
-    parser.addoption(
-        "--safety-shield",
-        action="store",
-        default=None,
-        help="Specify the safety shield to use for testing",
-    )
-
-
 SAFETY_SHIELD_PARAMS = [
     pytest.param(
         "meta-llama/Llama-Guard-3-1B", marks=pytest.mark.guard_1b, id="guard_1b"
diff --git a/llama_stack/providers/tests/scoring/conftest.py b/llama_stack/providers/tests/scoring/conftest.py
index dc4979dd7..0b4e7d46e 100644
--- a/llama_stack/providers/tests/scoring/conftest.py
+++ b/llama_stack/providers/tests/scoring/conftest.py
@@ -55,21 +55,6 @@ def pytest_configure(config):
         )
 
 
-def pytest_addoption(parser):
-    parser.addoption(
-        "--inference-model",
-        action="store",
-        default="meta-llama/Llama-3.2-3B-Instruct",
-        help="Specify the inference model to use for testing",
-    )
-    parser.addoption(
-        "--judge-model",
-        action="store",
-        default="meta-llama/Llama-3.1-8B-Instruct",
-        help="Specify the judge model to use for testing",
-    )
-
-
 def pytest_generate_tests(metafunc):
     judge_model = metafunc.config.getoption("--judge-model")
     if "judge_model" in metafunc.fixturenames:
diff --git a/llama_stack/providers/tests/tools/conftest.py b/llama_stack/providers/tests/tools/conftest.py
index 11aad5ab6..525abe8ab 100644
--- a/llama_stack/providers/tests/tools/conftest.py
+++ b/llama_stack/providers/tests/tools/conftest.py
@@ -34,21 +34,6 @@ def pytest_configure(config):
         )
 
 
-def pytest_addoption(parser):
-    parser.addoption(
-        "--inference-model",
-        action="store",
-        default="meta-llama/Llama-3.2-3B-Instruct",
-        help="Specify the inference model to use for testing",
-    )
-    parser.addoption(
-        "--safety-shield",
-        action="store",
-        default="meta-llama/Llama-Guard-3-1B",
-        help="Specify the safety shield to use for testing",
-    )
-
-
 def pytest_generate_tests(metafunc):
     if "tools_stack" in metafunc.fixturenames:
         available_fixtures = {