feat(tests): implement test isolation for inference recordings (#3681)

Uses test_id in request hashes and test-scoped subdirectories to prevent
cross-test contamination. Model list endpoints exclude test_id to enable
merging recordings from different servers.

Additionally, this PR adds a very useful `record-if-missing` mode, which we
will use instead of `record` (which records everything).

🤖 Co-authored with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Ashwin Bharambe 2025-10-04 11:34:18 -07:00 committed by GitHub
parent f176196fba
commit 045a0c1d57
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
428 changed files with 85345 additions and 104330 deletions

View file

@ -36,6 +36,24 @@ def pytest_sessionstart(session):
os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "replay"
@pytest.fixture(autouse=True)
def _track_test_context(request):
    """Expose the running test's nodeid to the inference recording system.

    Autouse fixture: for each test it stores ``request.node.nodeid`` in the
    ``_test_context`` contextvar, which the recording system reads to decide
    which subdirectory to use, and resets the contextvar after the test.
    """
    from llama_stack.testing.inference_recorder import _test_context

    # A nodeid looks like "tests/integration/responses/test_basic.py::test_foo[params]".
    current_test_id = request.node.nodeid
    reset_token = _test_context.set(current_test_id)

    yield

    # Teardown: restore whatever value the contextvar held before this test.
    _test_context.reset(reset_token)
def pytest_runtest_teardown(item):
# Check if the test actually ran and passed or failed, but was not skipped or an expected failure (xfail)
outcome = getattr(item, "execution_outcome", None)
@ -137,8 +155,8 @@ def pytest_addoption(parser):
parser.addoption(
"--inference-mode",
help="Inference mode: { record, replay, live } (default: replay)",
choices=["record", "replay", "live"],
help="Inference mode: { record, replay, live, record-if-missing } (default: replay)",
choices=["record", "replay", "live", "record-if-missing"],
default="replay",
)
parser.addoption(