fix(tests): ensure test isolation in server mode

Propagate test IDs from client to server via HTTP headers to maintain proper test isolation when running with server-based stack configs. Without this, recorded/replayed inference requests in server mode would leak across tests. Changes: - Patch client _prepare_request to inject test ID into provider data header - Sync test context from provider data on server side before storage operations - Set LLAMA_STACK_TEST_STACK_CONFIG_TYPE env var based on stack config - Configure console width for cleaner log output in CI - Add SQLITE_STORE_DIR temp directory for test data isolation
2025-12-14 20:52:37 +00:00 · 2025-10-08 11:00:49 -07:00 · 2025-10-08 11:00:49 -07:00 · d5296a35f6
commit d5296a35f6
parent bba9957edd
4 changed files with 219 additions and 106 deletions
--- a/llama_stack/log.py
+++ b/llama_stack/log.py
@ -128,7 +128,10 @@ def strip_rich_markup(text):
 class CustomRichHandler(RichHandler):
    def __init__(self, *args, **kwargs):
-        kwargs["console"] = Console()
+        # Set a reasonable default width for console output, especially when redirected to files
        console_width = int(os.environ.get("LLAMA_STACK_LOG_WIDTH", "120"))
        # Don't force terminal codes to avoid ANSI escape codes in log files
        kwargs["console"] = Console(width=console_width)
        super().__init__(*args, **kwargs)
    def emit(self, record):
--- a/llama_stack/testing/inference_recorder.py
+++ b/llama_stack/testing/inference_recorder.py
@ -15,7 +15,7 @@ from enum import StrEnum
 from pathlib import Path
 from typing import Any, Literal, cast
-from openai import NOT_GIVEN
+from openai import NOT_GIVEN, OpenAI
 from llama_stack.log import get_logger
@ -79,6 +79,96 @@ def normalize_request(method: str, url: str, headers: dict[str, Any], body: dict
    return hashlib.sha256(normalized_json.encode()).hexdigest()
 def _sync_test_context_from_provider_data():
    """In server mode, sync test ID from provider_data to _test_context.
    This ensures that storage operations (which read from _test_context) work correctly
    in server mode where the test ID arrives via HTTP header → provider_data.
    Returns a token to reset _test_context, or None if no sync was needed.
    """
    stack_config_type = os.environ.get("LLAMA_STACK_TEST_STACK_CONFIG_TYPE", "library_client")
    if stack_config_type != "server":
        return None
    try:
        from llama_stack.core.request_headers import PROVIDER_DATA_VAR
        provider_data = PROVIDER_DATA_VAR.get()
        if provider_data and "__test_id" in provider_data:
            test_id = provider_data["__test_id"]
            return _test_context.set(test_id)
    except ImportError:
        pass
    return None
 def patch_httpx_for_test_id():
    """Patch client _prepare_request methods to inject test ID into provider data header.
    This is needed for server mode where the test ID must be transported from
    client to server via HTTP headers. In library_client mode, this patch is a no-op
    since everything runs in the same process.
    We use the _prepare_request hook that Stainless clients provide for mutating
    requests after construction but before sending.
    """
    from llama_stack_client import LlamaStackClient
    if "llama_stack_client_prepare_request" in _original_methods:
        return
    _original_methods["llama_stack_client_prepare_request"] = LlamaStackClient._prepare_request
    _original_methods["openai_prepare_request"] = OpenAI._prepare_request
    def patched_prepare_request(self, request):
        # Call original first (it's a sync method that returns None)
        # Determine which original to call based on client type
        if "llama_stack_client" in self.__class__.__module__:
            _original_methods["llama_stack_client_prepare_request"](self, request)
            _original_methods["openai_prepare_request"](self, request)
        # Only inject test ID in server mode
        stack_config_type = os.environ.get("LLAMA_STACK_TEST_STACK_CONFIG_TYPE", "library_client")
        test_id = _test_context.get()
        if stack_config_type == "server" and test_id:
            provider_data_header = request.headers.get("X-LlamaStack-Provider-Data")
            if provider_data_header:
                provider_data = json.loads(provider_data_header)
            else:
                provider_data = {}
            provider_data["__test_id"] = test_id
            request.headers["X-LlamaStack-Provider-Data"] = json.dumps(provider_data)
        return None
    LlamaStackClient._prepare_request = patched_prepare_request
    OpenAI._prepare_request = patched_prepare_request
 # currently, unpatch is never called
 def unpatch_httpx_for_test_id():
    """Remove client _prepare_request patches for test ID injection."""
    if "llama_stack_client_prepare_request" not in _original_methods:
        return
    from llama_stack_client import LlamaStackClient
    LlamaStackClient._prepare_request = _original_methods["llama_stack_client_prepare_request"]
    del _original_methods["llama_stack_client_prepare_request"]
    # Also restore OpenAI client if it was patched
    if "openai_prepare_request" in _original_methods:
        OpenAI._prepare_request = _original_methods["openai_prepare_request"]
        del _original_methods["openai_prepare_request"]
 def get_inference_mode() -> InferenceMode:
    return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "replay").lower())
@ -244,7 +334,7 @@ class ResponseStorage:
        with open(response_path, "w") as f:
            json.dump(
                {
-                    "test_id": _test_context.get(),  # Include for debugging
+                    "test_id": _test_context.get(),
                    "request": request,
                    "response": serialized_response,
                },
@ -386,6 +476,10 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
        else:
            return await original_method(self, *args, **kwargs)
    # In server mode, sync test ID from provider_data to _test_context for storage operations
    test_context_token = _sync_test_context_from_provider_data()
    try:
        # Get base URL based on client type
        if client_type == "openai":
            base_url = str(self._client.base_url)
@ -488,6 +582,9 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
        else:
            raise AssertionError(f"Invalid mode: {mode}")
    finally:
        if test_context_token:
            _test_context.reset(test_context_token)
 def patch_inference_clients():
--- a/scripts/integration-tests.sh
+++ b/scripts/integration-tests.sh
@ -124,12 +124,6 @@ echo ""
 echo "Checking llama packages"
 uv pip list | grep llama
 # Check storage and memory before tests
 echo "=== System Resources Before Tests ==="
 free -h 2>/dev/null || echo "free command not available"
 df -h
 echo ""
 # Set environment variables
 export LLAMA_STACK_CLIENT_TIMEOUT=300
@ -144,6 +138,17 @@ echo "=== Applying Setup Environment Variables ==="
 # the server needs this
 export LLAMA_STACK_TEST_INFERENCE_MODE="$INFERENCE_MODE"
 export SQLITE_STORE_DIR=$(mktemp -d)
 echo "Setting SQLITE_STORE_DIR: $SQLITE_STORE_DIR"
 # Determine stack config type for api_recorder test isolation
 if [[ "$STACK_CONFIG" == server:* ]]; then
    export LLAMA_STACK_TEST_STACK_CONFIG_TYPE="server"
    echo "Setting stack config type: server"
 else
    export LLAMA_STACK_TEST_STACK_CONFIG_TYPE="library_client"
    echo "Setting stack config type: library_client"
 fi
 SETUP_ENV=$(PYTHONPATH=$THIS_DIR/.. python "$THIS_DIR/get_setup_env.py" --suite "$TEST_SUITE" --setup "$TEST_SETUP" --format bash)
 echo "Setting up environment variables:"
@ -186,6 +191,8 @@ if [[ "$STACK_CONFIG" == *"server:"* ]]; then
        echo "Llama Stack Server is already running, skipping start"
    else
        echo "=== Starting Llama Stack Server ==="
        # Set a reasonable log width for better readability in server.log
        export LLAMA_STACK_LOG_WIDTH=120
        nohup llama stack run ci-tests --image-type venv > server.log 2>&1 &
        echo "Waiting for Llama Stack Server to start..."
@ -277,11 +284,5 @@ else
    exit 1
 fi
 # Check storage and memory after tests
 echo ""
 echo "=== System Resources After Tests ==="
 free -h 2>/dev/null || echo "free command not available"
 df -h
 echo ""
 echo "=== Integration Tests Complete ==="
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@ -35,6 +35,18 @@ def pytest_sessionstart(session):
    if "LLAMA_STACK_TEST_INFERENCE_MODE" not in os.environ:
        os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "replay"
    stack_config = session.config.getoption("--stack-config", default=None)
    if stack_config and stack_config.startswith("server:"):
        os.environ["LLAMA_STACK_TEST_STACK_CONFIG_TYPE"] = "server"
        logger.info(f"Test stack config type: server (stack_config={stack_config})")
    else:
        os.environ["LLAMA_STACK_TEST_STACK_CONFIG_TYPE"] = "library_client"
        logger.info(f"Test stack config type: library_client (stack_config={stack_config})")
    from llama_stack.testing.inference_recorder import patch_httpx_for_test_id
    patch_httpx_for_test_id()
@pytest.fixture(autouse=True)
 def _track_test_context(request):