feat(ci): add support for docker:distro in tests (#3832)
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 1s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 0s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 0s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Python Package Build Test / build (3.13) (push) Failing after 2s
Test Llama Stack Build / generate-matrix (push) Successful in 6s
Unit Tests / unit-tests (3.12) (push) Failing after 5s
Test Llama Stack Build / build-single-provider (push) Failing after 9s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 10s
Vector IO Integration Tests / test-matrix (push) Failing after 14s
Unit Tests / unit-tests (3.13) (push) Failing after 7s
Test External API and Providers / test-external (venv) (push) Failing after 12s
API Conformance Tests / check-schema-compatibility (push) Successful in 19s
Test Llama Stack Build / build (push) Failing after 7s
Integration Tests (Replay) / Integration Tests (push) Failing after 26s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 25s
Python Package Build Test / build (3.12) (push) Failing after 33s
UI Tests / ui-tests (22) (push) Successful in 1m26s
Pre-commit / pre-commit (push) Successful in 2m18s

Also includes a critical bug fix so that test recordings can be found inside docker containers.
Ashwin Bharambe 2025-10-16 19:33:13 -07:00 committed by GitHub
parent b3099d40e2
commit cd152f4240
7 changed files with 932 additions and 12 deletions


@@ -338,7 +338,7 @@ fi
# Add other required commands generic to all containers
add_to_container << EOF
RUN mkdir -p /.llama /.cache && chmod -R g+rw /app /.llama /.cache
RUN mkdir -p /.llama /.cache && chmod -R g+rw /.llama /.cache && (chmod -R g+rw /app 2>/dev/null || true)
EOF
printf "Containerfile created successfully in %s/Containerfile\n\n" "$TEMP_DIR"


@@ -42,3 +42,8 @@ def sync_test_context_from_provider_data():
return TEST_CONTEXT.set(provider_data["__test_id"])
return None
def is_debug_mode() -> bool:
"""Check if test recording debug mode is enabled via LLAMA_STACK_TEST_DEBUG env var."""
return os.environ.get("LLAMA_STACK_TEST_DEBUG", "").lower() in ("1", "true", "yes")
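A hedged usage sketch — the runner script path is an assumption; any entry point that exercises the recorder honors the same variable:

    # "1", "true", or "yes" (case-insensitive) enables the [RECORDING DEBUG] logs added below
    LLAMA_STACK_TEST_DEBUG=1 ./scripts/integration-tests.sh --stack-config docker:ci-tests --suite base --setup ollama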


@@ -37,7 +37,7 @@ _id_counters: dict[str, dict[str, int]] = {}
# Test context uses ContextVar since it changes per-test and needs async isolation
from openai.types.completion_choice import CompletionChoice
from llama_stack.core.testing_context import get_test_context
from llama_stack.core.testing_context import get_test_context, is_debug_mode
# update the "finish_reason" field, since its type definition is wrong (it does not accept None)
CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "length", "content_filter"] | None
@@ -146,6 +146,7 @@ def normalize_inference_request(method: str, url: str, headers: dict[str, Any],
body_for_hash = _normalize_body_for_hash(body)
test_id = get_test_context()
normalized: dict[str, Any] = {
"method": method.upper(),
"endpoint": parsed.path,
@@ -154,10 +155,20 @@ def normalize_inference_request(method: str, url: str, headers: dict[str, Any],
# Include test_id for isolation, except for shared infrastructure endpoints
if parsed.path not in ("/api/tags", "/v1/models"):
normalized["test_id"] = get_test_context()
normalized["test_id"] = test_id
normalized_json = json.dumps(normalized, sort_keys=True)
return hashlib.sha256(normalized_json.encode()).hexdigest()
request_hash = hashlib.sha256(normalized_json.encode()).hexdigest()
if is_debug_mode():
logger.info("[RECORDING DEBUG] Hash computation:")
logger.info(f" Test ID: {test_id}")
logger.info(f" Method: {method.upper()}")
logger.info(f" Endpoint: {parsed.path}")
logger.info(f" Model: {body.get('model', 'N/A')}")
logger.info(f" Computed hash: {request_hash}")
return request_hash
def normalize_tool_request(provider_name: str, tool_name: str, kwargs: dict[str, Any]) -> str:
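The hash can be recomputed offline with the diagnose_recordings.py script added later in this commit; an illustrative invocation (body and test ID are placeholders):

    ./scripts/diagnose_recordings.py compute-hash \
      --endpoint /v1/chat/completions --method POST \
      --body '{"model":"llama3.2:3b","stream":false}' \
      --test-id "tests/integration/agents/test_agents.py::test_foo"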
@@ -212,6 +223,11 @@ def patch_httpx_for_test_id():
provider_data["__test_id"] = test_id
request.headers["X-LlamaStack-Provider-Data"] = json.dumps(provider_data)
if is_debug_mode():
logger.info("[RECORDING DEBUG] Injected test ID into request header:")
logger.info(f" Test ID: {test_id}")
logger.info(f" URL: {request.url}")
return None
LlamaStackClient._prepare_request = patched_prepare_request
@@ -355,12 +371,35 @@ class ResponseStorage:
test_file = test_id.split("::")[0] # Remove test function part
test_dir = Path(test_file).parent # Get parent directory
# Put recordings in a "recordings" subdirectory of the test's parent dir
# e.g., "tests/integration/inference" -> "tests/integration/inference/recordings"
return test_dir / "recordings"
if self.base_dir.is_absolute():
repo_root = self.base_dir.parent.parent.parent
result = repo_root / test_dir / "recordings"
if is_debug_mode():
logger.info("[RECORDING DEBUG] Path resolution (absolute base_dir):")
logger.info(f" Test ID: {test_id}")
logger.info(f" Base dir: {self.base_dir}")
logger.info(f" Repo root: {repo_root}")
logger.info(f" Test file: {test_file}")
logger.info(f" Test dir: {test_dir}")
logger.info(f" Recordings dir: {result}")
return result
else:
result = test_dir / "recordings"
if is_debug_mode():
logger.info("[RECORDING DEBUG] Path resolution (relative base_dir):")
logger.info(f" Test ID: {test_id}")
logger.info(f" Base dir: {self.base_dir}")
logger.info(f" Test dir: {test_dir}")
logger.info(f" Recordings dir: {result}")
return result
else:
# Fallback for non-test contexts
return self.base_dir / "recordings"
result = self.base_dir / "recordings"
if is_debug_mode():
logger.info("[RECORDING DEBUG] Path resolution (no test context):")
logger.info(f" Base dir: {self.base_dir}")
logger.info(f" Recordings dir: {result}")
return result
def _ensure_directory(self):
"""Ensure test-specific directories exist."""
@@ -395,6 +434,13 @@ class ResponseStorage:
response_path = responses_dir / response_file
if is_debug_mode():
logger.info("[RECORDING DEBUG] Storing recording:")
logger.info(f" Request hash: {request_hash}")
logger.info(f" File: {response_path}")
logger.info(f" Test ID: {get_test_context()}")
logger.info(f" Endpoint: {endpoint}")
# Save response to JSON file with metadata
with open(response_path, "w") as f:
json.dump(
@@ -423,16 +469,33 @@ class ResponseStorage:
test_dir = self._get_test_dir()
response_path = test_dir / response_file
if is_debug_mode():
logger.info("[RECORDING DEBUG] Looking up recording:")
logger.info(f" Request hash: {request_hash}")
logger.info(f" Primary path: {response_path}")
logger.info(f" Primary exists: {response_path.exists()}")
if response_path.exists():
if is_debug_mode():
logger.info(" Found in primary location")
return _recording_from_file(response_path)
# Fallback to base recordings directory (for session-level recordings)
fallback_dir = self.base_dir / "recordings"
fallback_path = fallback_dir / response_file
if is_debug_mode():
logger.info(f" Fallback path: {fallback_path}")
logger.info(f" Fallback exists: {fallback_path.exists()}")
if fallback_path.exists():
if is_debug_mode():
logger.info(" Found in fallback location")
return _recording_from_file(fallback_path)
if is_debug_mode():
logger.info(" Recording not found in either location")
return None
def _model_list_responses(self, request_hash: str) -> list[dict[str, Any]]:
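The find-hash subcommand of the new diagnostic script walks these same two locations (test-specific directory first, then the base-dir fallback); for example, using the sample hash from its docstring:

    ./scripts/diagnose_recordings.py find-hash 7526c930eab04ce337496a26cd15f2591d7943035f2527182861643da9b837a7 \
      --test-id "tests/integration/agents/test_agents.py::test_foo"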
@@ -588,6 +651,13 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
mode = _current_mode
storage = _current_storage
if is_debug_mode():
logger.info("[RECORDING DEBUG] Entering inference method:")
logger.info(f" Mode: {mode}")
logger.info(f" Client type: {client_type}")
logger.info(f" Endpoint: {endpoint}")
logger.info(f" Test context: {get_test_context()}")
if mode == APIRecordingMode.LIVE or storage is None:
if endpoint == "/v1/models":
return original_method(self, *args, **kwargs)
@@ -643,6 +713,18 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
return response_body
elif mode == APIRecordingMode.REPLAY:
# REPLAY mode requires recording to exist
if is_debug_mode():
logger.error("[RECORDING DEBUG] Recording not found!")
logger.error(f" Mode: {mode}")
logger.error(f" Request hash: {request_hash}")
logger.error(f" Method: {method}")
logger.error(f" URL: {url}")
logger.error(f" Endpoint: {endpoint}")
logger.error(f" Model: {body.get('model', 'unknown')}")
logger.error(f" Test context: {get_test_context()}")
logger.error(
f" Stack config type: {os.environ.get('LLAMA_STACK_TEST_STACK_CONFIG_TYPE', 'library_client')}"
)
raise RuntimeError(
f"Recording not found for request hash: {request_hash}\n"
f"Model: {body.get('model', 'unknown')} | Request: {method} {url}\n"

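When replay fails like this, the missing entries can be regenerated by rerunning against a live backend in record mode, e.g. with the docker helper added below (distro name illustrative):

    ./scripts/docker.sh start --distro ci-tests --inference-mode record             # re-record everything
    ./scripts/docker.sh start --distro ci-tests --inference-mode record-if-missing  # only fill in gaps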
scripts/diagnose_recordings.py (Executable file, +370 lines)

@@ -0,0 +1,370 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Diagnostic tool for debugging test recording issues.
Usage:
# Find where a hash would be looked up
./scripts/diagnose_recordings.py find-hash 7526c930eab04ce337496a26cd15f2591d7943035f2527182861643da9b837a7
# Show what's in a recording file
./scripts/diagnose_recordings.py show tests/integration/agents/recordings/7526c930....json
# List all recordings for a test
./scripts/diagnose_recordings.py list-test "tests/integration/agents/test_agents.py::test_custom_tool"
# Explain lookup paths for a test
./scripts/diagnose_recordings.py explain-paths --test-id "tests/integration/agents/test_agents.py::test_foo"
# Compare request hash computation
./scripts/diagnose_recordings.py compute-hash --endpoint /v1/chat/completions --method POST --body '{"model":"llama3.2:3b"}' --test-id "..."
"""
import argparse
import json
import sys
from pathlib import Path
# Add parent directory to path to import from llama_stack
REPO_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(REPO_ROOT))
try:
from llama_stack.testing.api_recorder import normalize_inference_request
except ImportError:
normalize_inference_request = None
def find_hash(hash_value: str, base_dir: Path | None = None, test_id: str | None = None):
"""Find where a hash would be looked up and what exists"""
if base_dir is None:
base_dir = REPO_ROOT / "tests/integration/common"
print(f"Searching for hash: {hash_value}\n")
print(f"Base dir: {base_dir} (absolute={base_dir.is_absolute()})")
# Compute test directory
if test_id:
test_file = test_id.split("::")[0]
test_dir = Path(test_file).parent
if base_dir.is_absolute():
repo_root = base_dir.parent.parent.parent
test_recordings_dir = repo_root / test_dir / "recordings"
else:
test_recordings_dir = test_dir / "recordings"
print(f"Test ID: {test_id}")
print(f"Test dir: {test_recordings_dir}\n")
else:
test_recordings_dir = base_dir / "recordings"
print("No test ID provided, using base dir\n")
# Check primary location
response_file = f"{hash_value}.json"
response_path = test_recordings_dir / response_file
print("Checking primary location:")
print(f" {response_path}")
if response_path.exists():
print(" EXISTS")
print("\nFound! Contents:")
show_recording(response_path)
return True
else:
print(" Does not exist")
# Check fallback location
fallback_dir = base_dir / "recordings"
fallback_path = fallback_dir / response_file
print("\nChecking fallback location:")
print(f" {fallback_path}")
if fallback_path.exists():
print(" EXISTS")
print("\nFound in fallback! Contents:")
show_recording(fallback_path)
return True
else:
print(" Does not exist")
# Show what files DO exist
print(f"\nFiles in test directory ({test_recordings_dir}):")
if test_recordings_dir.exists():
json_files = list(test_recordings_dir.glob("*.json"))
if json_files:
for f in json_files[:20]:
print(f" - {f.name}")
if len(json_files) > 20:
print(f" ... and {len(json_files) - 20} more")
else:
print(" (empty)")
else:
print(" Directory does not exist")
print(f"\nFiles in fallback directory ({fallback_dir}):")
if fallback_dir.exists():
json_files = list(fallback_dir.glob("*.json"))
if json_files:
for f in json_files[:20]:
print(f" - {f.name}")
if len(json_files) > 20:
print(f" ... and {len(json_files) - 20} more")
else:
print(" (empty)")
else:
print(" Directory does not exist")
# Try partial hash match
print("\nLooking for partial matches (first 16 chars)...")
partial = hash_value[:16]
matches = []
for dir_to_search in [test_recordings_dir, fallback_dir]:
if dir_to_search.exists():
for f in dir_to_search.glob("*.json"):
if f.stem.startswith(partial):
matches.append(f)
if matches:
print(f"Found {len(matches)} partial match(es):")
for m in matches:
print(f" {m}")
else:
print("No partial matches found")
return False
def show_recording(file_path: Path):
"""Show contents of a recording file"""
if not file_path.exists():
print(f"File does not exist: {file_path}")
return
with open(file_path) as f:
data = json.load(f)
print(f"\nRecording: {file_path.name}\n")
print(f"Test ID: {data.get('test_id', 'N/A')}")
print("\nRequest:")
req = data.get("request", {})
print(f" Method: {req.get('method', 'N/A')}")
print(f" URL: {req.get('url', 'N/A')}")
print(f" Endpoint: {req.get('endpoint', 'N/A')}")
print(f" Model: {req.get('model', 'N/A')}")
body = req.get("body", {})
if body:
print("\nRequest Body:")
print(f" Model: {body.get('model', 'N/A')}")
print(f" Stream: {body.get('stream', 'N/A')}")
if "messages" in body:
print(f" Messages: {len(body['messages'])} message(s)")
for i, msg in enumerate(body["messages"][:3]):
role = msg.get("role", "unknown")
content = msg.get("content", "")
if isinstance(content, str):
preview = content[:80] + "..." if len(content) > 80 else content
else:
preview = f"[{type(content).__name__}]"
print(f" [{i}] {role}: {preview}")
if "tools" in body:
print(f" Tools: {len(body['tools'])} tool(s)")
response = data.get("response", {})
if response:
print("\nResponse:")
print(f" Is streaming: {response.get('is_streaming', False)}")
response_body = response.get("body", {})
if isinstance(response_body, dict):
if "__type__" in response_body:
print(f" Type: {response_body['__type__']}")
if "__data__" in response_body:
response_data = response_body["__data__"]
if "choices" in response_data:
print(f" Choices: {len(response_data['choices'])}")
if "usage" in response_data:
usage = response_data["usage"]
print(f" Usage: in={usage.get('input_tokens')}, out={usage.get('output_tokens')}")
def list_test_recordings(test_id: str, base_dir: Path | None = None):
"""List all recordings for a specific test"""
if base_dir is None:
base_dir = REPO_ROOT / "tests/integration/common"
test_file = test_id.split("::")[0]
test_dir = Path(test_file).parent
if base_dir.is_absolute():
repo_root = base_dir.parent.parent.parent
test_recordings_dir = repo_root / test_dir / "recordings"
else:
test_recordings_dir = test_dir / "recordings"
print(f"Recordings for test: {test_id}\n")
print(f"Directory: {test_recordings_dir}\n")
if not test_recordings_dir.exists():
print("Directory does not exist")
return
# Find all recordings for this specific test
recordings = []
for f in test_recordings_dir.glob("*.json"):
try:
with open(f) as fp:
data = json.load(fp)
if data.get("test_id") == test_id:
recordings.append((f, data))
except Exception as e:
print(f"Could not read {f.name}: {e}")
if not recordings:
print("No recordings found for this exact test ID")
print("\nAll files in directory:")
for f in test_recordings_dir.glob("*.json"):
print(f" - {f.name}")
return
print(f"Found {len(recordings)} recording(s):\n")
for f, data in recordings:
req = data.get("request", {})
print(f" {f.name}")
print(f" Endpoint: {req.get('endpoint', 'N/A')}")
print(f" Model: {req.get('model', 'N/A')}")
print("")
def explain_paths(test_id: str | None = None, base_dir: Path | None = None):
"""Explain where recordings would be searched"""
if base_dir is None:
base_dir = REPO_ROOT / "tests/integration/common"
print("Recording Lookup Path Explanation\n")
print(f"Base directory: {base_dir}")
print(f" Absolute: {base_dir.is_absolute()}")
print("")
if test_id:
print(f"Test ID: {test_id}")
test_file = test_id.split("::")[0]
print(f" Test file: {test_file}")
test_dir = Path(test_file).parent
print(f" Test dir (relative): {test_dir}")
if base_dir.is_absolute():
repo_root = base_dir.parent.parent.parent
print(f" Repo root: {repo_root}")
test_recordings_dir = repo_root / test_dir / "recordings"
print(f" Test recordings dir (absolute): {test_recordings_dir}")
else:
test_recordings_dir = test_dir / "recordings"
print(f" Test recordings dir (relative): {test_recordings_dir}")
print("\nLookup order for recordings:")
print(f" 1. Test-specific: {test_recordings_dir}/<hash>.json")
print(f" 2. Fallback: {base_dir}/recordings/<hash>.json")
else:
print("No test ID provided")
print("\nLookup location:")
print(f" {base_dir}/recordings/<hash>.json")
def compute_hash(endpoint: str, method: str, body_json: str, test_id: str | None = None):
"""Compute hash for a request"""
if normalize_inference_request is None:
print("Could not import normalize_inference_request from llama_stack.testing.api_recorder")
print("Make sure you're running from the repo root with proper PYTHONPATH")
return
try:
body = json.loads(body_json)
except json.JSONDecodeError as e:
print(f"Invalid JSON in body: {e}")
return
# Create a fake URL with the endpoint
url = f"http://example.com{endpoint}"
# Set test context if provided
if test_id:
from llama_stack.core.testing_context import set_test_context
set_test_context(test_id)
hash_result = normalize_inference_request(method, url, {}, body)
print("Hash Computation\n")
print(f"Method: {method}")
print(f"Endpoint: {endpoint}")
print(f"Test ID: {test_id or 'None (excluded from hash for model-list endpoints)'}")
print("\nBody:")
print(json.dumps(body, indent=2))
print(f"\nComputed Hash: {hash_result}")
print(f"\nLooking for file: {hash_result}.json")
def main():
parser = argparse.ArgumentParser(
description="Diagnostic tool for test recording issues",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
subparsers = parser.add_subparsers(dest="command", help="Command to run")
# find-hash command
find_parser = subparsers.add_parser("find-hash", help="Find where a hash would be looked up")
find_parser.add_argument("hash", help="Hash value to search for (full or partial)")
find_parser.add_argument("--test-id", help="Test ID to determine search paths")
find_parser.add_argument("--base-dir", type=Path, help="Base directory (default: tests/integration/common)")
# show command
show_parser = subparsers.add_parser("show", help="Show contents of a recording file")
show_parser.add_argument("file", type=Path, help="Path to recording JSON file")
# list-test command
list_parser = subparsers.add_parser("list-test", help="List all recordings for a test")
list_parser.add_argument("test_id", help="Full test ID (e.g., tests/integration/agents/test_agents.py::test_foo)")
list_parser.add_argument("--base-dir", type=Path, help="Base directory (default: tests/integration/common)")
# explain-paths command
explain_parser = subparsers.add_parser("explain-paths", help="Explain where recordings are searched")
explain_parser.add_argument("--test-id", help="Test ID to show paths for")
explain_parser.add_argument("--base-dir", type=Path, help="Base directory (default: tests/integration/common)")
# compute-hash command
hash_parser = subparsers.add_parser("compute-hash", help="Compute hash for a request")
hash_parser.add_argument("--endpoint", required=True, help="Endpoint path (e.g., /v1/chat/completions)")
hash_parser.add_argument("--method", default="POST", help="HTTP method (default: POST)")
hash_parser.add_argument("--body", required=True, help="Request body as JSON string")
hash_parser.add_argument("--test-id", help="Test ID (affects hash for non-model-list endpoints)")
args = parser.parse_args()
if not args.command:
parser.print_help()
return
if args.command == "find-hash":
find_hash(args.hash, args.base_dir, args.test_id)
elif args.command == "show":
show_recording(args.file)
elif args.command == "list-test":
list_test_recordings(args.test_id, args.base_dir)
elif args.command == "explain-paths":
explain_paths(args.test_id, args.base_dir)
elif args.command == "compute-hash":
compute_hash(args.endpoint, args.method, args.body, args.test_id)
if __name__ == "__main__":
main()

scripts/docker.sh (Executable file, +358 lines)

@@ -0,0 +1,358 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
# Docker container management script for Llama Stack
# Allows starting/stopping/restarting a Llama Stack docker container for testing
# Default values
DISTRO=""
PORT=8321
INFERENCE_MODE="replay"
COMMAND=""
USE_COPY_NOT_MOUNT=false
NO_REBUILD=false
# Function to display usage
usage() {
cat <<EOF
Usage: $0 COMMAND [OPTIONS]
Commands:
start Build and start the docker container
stop Stop and remove the docker container
restart Restart the docker container
status Check if the container is running
logs Show container logs (add -f to follow)
Options:
--distro STRING Distribution name (e.g., 'ci-tests', 'starter') (required for start/restart)
--port NUMBER Port to run on (default: 8321)
--inference-mode STRING Inference mode: replay, record-if-missing or record (default: replay)
--copy-source Copy source into image instead of mounting (default: auto-detect CI, otherwise mount)
--no-rebuild Skip building the image, just start the container (default: false)
--help Show this help message
Examples:
# Start a docker container (local dev mode - mounts source, builds image)
$0 start --distro ci-tests
# Start without rebuilding (uses existing image)
$0 start --distro ci-tests --no-rebuild
# Start with source copied into image (like CI)
$0 start --distro ci-tests --copy-source
# Start with custom port
$0 start --distro starter --port 8080
# Check status
$0 status --distro ci-tests
# View logs
$0 logs --distro ci-tests
# Stop container
$0 stop --distro ci-tests
# Restart container
$0 restart --distro ci-tests
Note: In CI environments (detected via CI or GITHUB_ACTIONS env vars), source is
automatically copied into the image. Locally, source is mounted for live development
unless --copy-source is specified.
EOF
}
# Parse command (first positional arg)
if [[ $# -eq 0 ]]; then
echo "Error: Command required"
usage
exit 1
fi
COMMAND="$1"
shift
# Validate command
case "$COMMAND" in
start | stop | restart | status | logs) ;;
--help)
usage
exit 0
;;
*)
echo "Error: Unknown command: $COMMAND"
usage
exit 1
;;
esac
# Parse options
while [[ $# -gt 0 ]]; do
case $1 in
--distro)
DISTRO="$2"
shift 2
;;
--port)
PORT="$2"
shift 2
;;
--inference-mode)
INFERENCE_MODE="$2"
shift 2
;;
--copy-source)
USE_COPY_NOT_MOUNT=true
shift
;;
--no-rebuild)
NO_REBUILD=true
shift
;;
--help)
usage
exit 0
;;
*)
echo "Unknown option: $1"
usage
exit 1
;;
esac
done
# Validate required parameters for commands that need them
if [[ "$COMMAND" != "stop" && "$COMMAND" != "status" && "$COMMAND" != "logs" ]]; then
if [[ -z "$DISTRO" ]]; then
echo "Error: --distro is required for '$COMMAND' command"
usage
exit 1
fi
fi
# If distro not provided for stop/status/logs, try to infer from running containers
if [[ -z "$DISTRO" && ("$COMMAND" == "stop" || "$COMMAND" == "status" || "$COMMAND" == "logs") ]]; then
# Look for any llama-stack-test-* container
RUNNING_CONTAINERS=$(docker ps -a --filter "name=llama-stack-test-" --format "{{.Names}}" | head -1)
if [[ -n "$RUNNING_CONTAINERS" ]]; then
DISTRO=$(echo "$RUNNING_CONTAINERS" | sed 's/llama-stack-test-//')
echo "Found running container for distro: $DISTRO"
else
echo "Error: --distro is required (no running containers found)"
usage
exit 1
fi
fi
# Remove docker: prefix if present
DISTRO=$(echo "$DISTRO" | sed 's/^docker://')
CONTAINER_NAME="llama-stack-test-$DISTRO"
# Function to check if container is running
is_container_running() {
docker ps --filter "name=^${CONTAINER_NAME}$" --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"
}
# Function to check if container exists (running or stopped)
container_exists() {
docker ps -a --filter "name=^${CONTAINER_NAME}$" --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"
}
# Function to stop and remove container
stop_container() {
if container_exists; then
echo "Stopping container: $CONTAINER_NAME"
docker stop "$CONTAINER_NAME" 2>/dev/null || true
echo "Removing container: $CONTAINER_NAME"
docker rm "$CONTAINER_NAME" 2>/dev/null || true
echo "✅ Container stopped and removed"
else
echo "⚠️ Container $CONTAINER_NAME does not exist"
fi
}
# Function to build docker image
build_image() {
echo "=== Building Docker Image for distribution: $DISTRO ==="
# Get the repo root (parent of scripts directory)
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
REPO_ROOT=$(cd "$SCRIPT_DIR/.." && pwd)
# Determine whether to copy or mount source
# Copy in CI or if explicitly requested, otherwise mount for live development
BUILD_ENV="LLAMA_STACK_DIR=$REPO_ROOT"
if [[ "$USE_COPY_NOT_MOUNT" == "true" ]] || [[ "${CI:-false}" == "true" ]] || [[ "${GITHUB_ACTIONS:-false}" == "true" ]]; then
echo "Copying source into image (USE_COPY_NOT_MOUNT=true, CI=${CI:-false}, GITHUB_ACTIONS=${GITHUB_ACTIONS:-false})"
BUILD_ENV="USE_COPY_NOT_MOUNT=true $BUILD_ENV"
else
echo "Will mount source for live development"
fi
if ! eval "$BUILD_ENV llama stack build --distro '$DISTRO' --image-type container"; then
echo "❌ Failed to build Docker image"
exit 1
fi
echo "✅ Docker image built successfully"
}
# Function to start container
start_container() {
# Check if already running
if is_container_running; then
echo "⚠️ Container $CONTAINER_NAME is already running"
echo "URL: http://localhost:$PORT"
exit 0
fi
# Stop and remove if exists but not running
if container_exists; then
echo "Removing existing stopped container..."
docker rm "$CONTAINER_NAME" 2>/dev/null || true
fi
# Build the image unless --no-rebuild was specified
if [[ "$NO_REBUILD" == "true" ]]; then
echo "Skipping build (--no-rebuild specified)"
# Check if image exists (with or without localhost/ prefix)
if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "distribution-$DISTRO:dev$"; then
echo "❌ Error: Image distribution-$DISTRO:dev does not exist"
echo "Either build it first without --no-rebuild, or run: llama stack build --distro $DISTRO --image-type container"
exit 1
fi
echo "✅ Found existing image for distribution-$DISTRO:dev"
else
build_image
fi
echo ""
echo "=== Starting Docker Container ==="
# Get the repo root for volume mount
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
REPO_ROOT=$(cd "$SCRIPT_DIR/.." && pwd)
# Determine the actual image name (may have localhost/ prefix)
IMAGE_NAME=$(docker images --format "{{.Repository}}:{{.Tag}}" | grep "distribution-$DISTRO:dev$" | head -1)
if [[ -z "$IMAGE_NAME" ]]; then
echo "❌ Error: Could not find image for distribution-$DISTRO:dev"
exit 1
fi
echo "Using image: $IMAGE_NAME"
# Build environment variables for docker run
DOCKER_ENV_VARS=""
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_INFERENCE_MODE=$INFERENCE_MODE"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_STACK_CONFIG_TYPE=server"
# Set default OLLAMA_URL if not provided
# On macOS/Windows, use host.docker.internal to reach host from container
# On Linux with --network host, use localhost
if [[ "$(uname)" == "Darwin" ]] || [[ "$(uname)" == *"MINGW"* ]]; then
OLLAMA_URL="${OLLAMA_URL:-http://host.docker.internal:11434}"
else
OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
fi
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL"
# Pass through API keys if they exist
[ -n "${TOGETHER_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e TOGETHER_API_KEY=$TOGETHER_API_KEY"
[ -n "${FIREWORKS_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e FIREWORKS_API_KEY=$FIREWORKS_API_KEY"
[ -n "${TAVILY_SEARCH_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e TAVILY_SEARCH_API_KEY=$TAVILY_SEARCH_API_KEY"
[ -n "${OPENAI_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OPENAI_API_KEY=$OPENAI_API_KEY"
[ -n "${ANTHROPIC_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY"
[ -n "${GROQ_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e GROQ_API_KEY=$GROQ_API_KEY"
[ -n "${GEMINI_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e GEMINI_API_KEY=$GEMINI_API_KEY"
[ -n "${SAFETY_MODEL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e SAFETY_MODEL=$SAFETY_MODEL"
[ -n "${SQLITE_STORE_DIR:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e SQLITE_STORE_DIR=$SQLITE_STORE_DIR"
# Use --network host on Linux only (macOS doesn't support it properly)
NETWORK_MODE=""
if [[ "$(uname)" != "Darwin" ]] && [[ "$(uname)" != *"MINGW"* ]]; then
NETWORK_MODE="--network host"
fi
docker run -d $NETWORK_MODE --name "$CONTAINER_NAME" \
-p $PORT:$PORT \
$DOCKER_ENV_VARS \
-v "$REPO_ROOT":/app/llama-stack-source \
"$IMAGE_NAME" \
--port $PORT
echo "Waiting for container to start..."
for i in {1..30}; do
if curl -s http://localhost:$PORT/v1/health 2>/dev/null | grep -q "OK"; then
echo "✅ Container started successfully"
echo ""
echo "=== Container Information ==="
echo "Container name: $CONTAINER_NAME"
echo "URL: http://localhost:$PORT"
echo "Health check: http://localhost:$PORT/v1/health"
echo ""
echo "To view logs: $0 logs --distro $DISTRO"
echo "To stop: $0 stop --distro $DISTRO"
return 0
fi
if [[ $i -eq 30 ]]; then
echo "❌ Container failed to start within timeout"
echo "Showing container logs:"
docker logs "$CONTAINER_NAME"
exit 1
fi
sleep 1
done
}
# Execute command
case "$COMMAND" in
start)
start_container
;;
stop)
stop_container
;;
restart)
echo "Restarting container: $CONTAINER_NAME"
stop_container
echo ""
start_container
;;
status)
if is_container_running; then
echo "✅ Container $CONTAINER_NAME is running"
echo "URL: http://localhost:$PORT"
# Try to get the actual port from the container
ACTUAL_PORT=$(docker port "$CONTAINER_NAME" 2>/dev/null | grep "8321/tcp" | cut -d':' -f2 | head -1)
if [[ -n "$ACTUAL_PORT" ]]; then
echo "Port: $ACTUAL_PORT"
fi
elif container_exists; then
echo "⚠️ Container $CONTAINER_NAME exists but is not running"
echo "Start it with: $0 start --distro $DISTRO"
else
echo "❌ Container $CONTAINER_NAME does not exist"
echo "Start it with: $0 start --distro $DISTRO"
fi
;;
logs)
if container_exists; then
echo "=== Logs for $CONTAINER_NAME ==="
# Check if -f flag was passed after 'logs' command
if [[ "${1:-}" == "-f" || "${1:-}" == "--follow" ]]; then
docker logs --tail 100 --follow "$CONTAINER_NAME"
else
docker logs --tail 100 "$CONTAINER_NAME"
fi
else
echo "❌ Container $CONTAINER_NAME does not exist"
exit 1
fi
;;
esac


@@ -42,9 +42,12 @@ Setups are defined in tests/integration/setups.py and provide global configurati
You can also specify subdirectories (of tests/integration) to select tests from, which will override the suite.
Examples:
# Basic inference tests with ollama
# Basic inference tests with ollama (server mode)
$0 --stack-config server:ci-tests --suite base --setup ollama
# Basic inference tests with docker
$0 --stack-config docker:ci-tests --suite base --setup ollama
# Multiple test directories with vllm
$0 --stack-config server:ci-tests --subdirs 'inference,agents' --setup vllm
@@ -153,7 +156,7 @@ echo "Setting SQLITE_STORE_DIR: $SQLITE_STORE_DIR"
# Determine stack config type for api_recorder test isolation
if [[ "$COLLECT_ONLY" == false ]]; then
if [[ "$STACK_CONFIG" == server:* ]]; then
if [[ "$STACK_CONFIG" == server:* ]] || [[ "$STACK_CONFIG" == docker:* ]]; then
export LLAMA_STACK_TEST_STACK_CONFIG_TYPE="server"
echo "Setting stack config type: server"
else
@@ -229,6 +232,104 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
trap stop_server EXIT ERR INT TERM
fi
# Start Docker Container if needed
if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
stop_container() {
echo "Stopping Docker container..."
container_name="llama-stack-test-$DISTRO"
if docker ps -a --format '{{.Names}}' | grep -q "^${container_name}$"; then
echo "Stopping and removing container: $container_name"
docker stop "$container_name" 2>/dev/null || true
docker rm "$container_name" 2>/dev/null || true
else
echo "No container named $container_name found"
fi
echo "Docker container stopped"
}
# Extract distribution name from docker:distro format
DISTRO=$(echo "$STACK_CONFIG" | sed 's/^docker://')
export LLAMA_STACK_PORT=8321
echo "=== Building Docker Image for distribution: $DISTRO ==="
# Set LLAMA_STACK_DIR to repo root
# USE_COPY_NOT_MOUNT copies files into image (for CI), otherwise mounts for live development
BUILD_ENV="LLAMA_STACK_DIR=$ROOT_DIR"
if [[ "${CI:-false}" == "true" ]] || [[ "${GITHUB_ACTIONS:-false}" == "true" ]]; then
echo "CI detected (CI=$CI, GITHUB_ACTIONS=$GITHUB_ACTIONS): copying source into image"
BUILD_ENV="USE_COPY_NOT_MOUNT=true $BUILD_ENV"
else
echo "Local mode: will mount source for live development"
fi
eval "$BUILD_ENV llama stack build --distro '$DISTRO' --image-type container"
if [ $? -ne 0 ]; then
echo "❌ Failed to build Docker image"
exit 1
fi
echo ""
echo "=== Starting Docker Container ==="
container_name="llama-stack-test-$DISTRO"
# Stop and remove existing container if it exists
docker stop "$container_name" 2>/dev/null || true
docker rm "$container_name" 2>/dev/null || true
# Build environment variables for docker run
DOCKER_ENV_VARS=""
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_INFERENCE_MODE=$INFERENCE_MODE"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_STACK_CONFIG_TYPE=server"
# Pass through API keys if they exist
[ -n "${TOGETHER_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e TOGETHER_API_KEY=$TOGETHER_API_KEY"
[ -n "${FIREWORKS_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e FIREWORKS_API_KEY=$FIREWORKS_API_KEY"
[ -n "${TAVILY_SEARCH_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e TAVILY_SEARCH_API_KEY=$TAVILY_SEARCH_API_KEY"
[ -n "${OPENAI_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OPENAI_API_KEY=$OPENAI_API_KEY"
[ -n "${ANTHROPIC_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY"
[ -n "${GROQ_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e GROQ_API_KEY=$GROQ_API_KEY"
[ -n "${GEMINI_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e GEMINI_API_KEY=$GEMINI_API_KEY"
[ -n "${OLLAMA_URL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL"
[ -n "${SAFETY_MODEL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e SAFETY_MODEL=$SAFETY_MODEL"
# Determine the actual image name (may have localhost/ prefix)
IMAGE_NAME=$(docker images --format "{{.Repository}}:{{.Tag}}" | grep "distribution-$DISTRO:dev$" | head -1)
if [[ -z "$IMAGE_NAME" ]]; then
echo "❌ Error: Could not find image for distribution-$DISTRO:dev"
exit 1
fi
echo "Using image: $IMAGE_NAME"
docker run -d --network host --name "$container_name" \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
$DOCKER_ENV_VARS \
-v $ROOT_DIR:/app/llama-stack-source \
"$IMAGE_NAME" \
--port $LLAMA_STACK_PORT
echo "Waiting for Docker container to start..."
for i in {1..30}; do
if curl -s http://localhost:$LLAMA_STACK_PORT/v1/health 2>/dev/null | grep -q "OK"; then
echo "✅ Docker container started successfully"
break
fi
if [[ $i -eq 30 ]]; then
echo "❌ Docker container failed to start"
echo "Container logs:"
docker logs "$container_name"
exit 1
fi
sleep 1
done
echo ""
# Update STACK_CONFIG to point to the running container
STACK_CONFIG="http://localhost:$LLAMA_STACK_PORT"
trap stop_container EXIT ERR INT TERM
fi
# Run tests
echo "=== Running Integration Tests ==="
EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"


@@ -42,7 +42,9 @@ def pytest_sessionstart(session):
# Set test stack config type for api_recorder test isolation
stack_config = session.config.getoption("--stack-config", default=None)
if stack_config and (stack_config.startswith("server:") or stack_config.startswith("http")):
if stack_config and (
stack_config.startswith("server:") or stack_config.startswith("docker:") or stack_config.startswith("http")
):
os.environ["LLAMA_STACK_TEST_STACK_CONFIG_TYPE"] = "server"
logger.info(f"Test stack config type: server (stack_config={stack_config})")
else:
@@ -139,7 +141,9 @@ def pytest_addoption(parser):
a 'pointer' to the stack. this can be either be:
(a) a template name like `starter`, or
(b) a path to a run.yaml file, or
(c) an adhoc config spec, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`
(c) an adhoc config spec, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`, or
(d) a server config like `server:ci-tests`, or
(e) a docker config like `docker:ci-tests` (builds and runs container)
"""
),
)
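With that in place, a docker-backed run can also be launched straight from pytest; a minimal sketch (test path assumed):

    # builds the distribution container, runs it, and points the test client at it
    pytest tests/integration --stack-config docker:ci-tests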