From dae1fcd3c2bb0b440f29c2f7291a5d47c81b4051 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Thu, 3 Jul 2025 19:51:46 +0200
Subject: [PATCH] ci: let pytest run the distro server (#2586)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

* Use the #2580 functionality to auto-start the server with the tests
* Reduce the server-readiness timeout to 30 seconds
* Print server logs on errors
* Collect pytest logs to a per-test-type file (pytest-<test-type>.log)

Signed-off-by: Sébastien Han
---
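Note for reviewers (not part of the commit): with the new server:ollama stack
config the fixtures start and stop the server themselves, so the CI job below
can be reproduced locally with roughly the following command, assuming Ollama
is already running as it is in CI (test directory, model names and filter are
taken from the workflow; adjust as needed):

    uv run pytest -s -v tests/integration/inference \
      --stack-config=server:ollama \
      -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
      --text-model="meta-llama/Llama-3.2-3B-Instruct" \
      --embedding-model=all-MiniLM-L6-v2
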
 .github/workflows/integration-tests.yml | 41 +++----------------
 tests/integration/fixtures/common.py    | 57 ++++++++++++++++++++++---
 2 files changed, 56 insertions(+), 42 deletions(-)

diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index 32e221128..0dc7a9889 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -25,7 +25,7 @@ jobs:
         # Listing tests manually since some of them currently fail
         # TODO: generate matrix list from tests/integration when fixed
         test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime, vector_io]
-        client-type: [library, http]
+        client-type: [library, server]
         python-version: ["3.12", "3.13"]
       fail-fast: false # we want to run all tests regardless of failure
@@ -45,39 +45,6 @@ jobs:
         run: |
           uv run llama stack build --template ollama --image-type venv
 
-      - name: Start Llama Stack server in background
-        if: matrix.client-type == 'http'
-        env:
-          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
-        run: |
-          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv --env OLLAMA_URL="http://0.0.0.0:11434" &
-
-      - name: Wait for Llama Stack server to be ready
-        if: matrix.client-type == 'http'
-        run: |
-          echo "Waiting for Llama Stack server..."
-          for i in {1..30}; do
-            if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
-              echo "Llama Stack server is up!"
-              exit 0
-            fi
-            sleep 1
-          done
-          echo "Llama Stack server failed to start"
-          cat server.log
-          exit 1
-
-      - name: Verify Ollama status is OK
-        if: matrix.client-type == 'http'
-        run: |
-          echo "Verifying Ollama status..."
-          ollama_status=$(curl -s -L http://127.0.0.1:8321/v1/providers/ollama|jq --raw-output .health.status)
-          echo "Ollama status: $ollama_status"
-          if [ "$ollama_status" != "OK" ]; then
-            echo "Ollama health check failed"
-            exit 1
-          fi
-
       - name: Check Storage and Memory Available Before Tests
         if: ${{ always() }}
         run: |
@@ -92,12 +59,14 @@ jobs:
           if [ "${{ matrix.client-type }}" == "library" ]; then
             stack_config="ollama"
           else
-            stack_config="http://localhost:8321"
+            stack_config="server:ollama"
           fi
           uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
             -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
             --text-model="meta-llama/Llama-3.2-3B-Instruct" \
-            --embedding-model=all-MiniLM-L6-v2
+            --embedding-model=all-MiniLM-L6-v2 \
+            --color=yes \
+            --capture=tee-sys | tee pytest-${{ matrix.test-type }}.log
 
       - name: Check Storage and Memory Available After Tests
         if: ${{ always() }}
diff --git a/tests/integration/fixtures/common.py b/tests/integration/fixtures/common.py
index 2d6092e44..ecd29484b 100644
--- a/tests/integration/fixtures/common.py
+++ b/tests/integration/fixtures/common.py
@@ -37,26 +37,42 @@ def is_port_available(port: int, host: str = "localhost") -> bool:
 def start_llama_stack_server(config_name: str) -> subprocess.Popen:
     """Start a llama stack server with the given config."""
     cmd = ["llama", "stack", "run", config_name]
-
-    # Start server in background
-    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    devnull = open(os.devnull, "w")
+    process = subprocess.Popen(
+        cmd,
+        stdout=devnull,  # redirect stdout to devnull to prevent deadlock
+        stderr=devnull,  # redirect stderr to devnull to prevent deadlock
+        text=True,
+        env={**os.environ, "LLAMA_STACK_LOG_FILE": "server.log"},
+    )
     return process
 
 
-def wait_for_server_ready(base_url: str, timeout: int = 120) -> bool:
+def wait_for_server_ready(base_url: str, timeout: int = 30, process: subprocess.Popen | None = None) -> bool:
     """Wait for the server to be ready by polling the health endpoint."""
     health_url = f"{base_url}/v1/health"
     start_time = time.time()
 
     while time.time() - start_time < timeout:
+        if process and process.poll() is not None:
+            print(f"Server process terminated with return code: {process.returncode}")
+            return False
+
         try:
             response = requests.get(health_url, timeout=5)
             if response.status_code == 200:
                 return True
         except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
             pass
+
+        # Print progress every 5 seconds
+        elapsed = time.time() - start_time
+        if int(elapsed) % 5 == 0 and elapsed > 0:
+            print(f"Waiting for server at {base_url}... ({elapsed:.1f}s elapsed)")
+
         time.sleep(0.5)
+
+    print(f"Server failed to respond within {timeout} seconds")
     return False
@@ -179,11 +195,12 @@ def llama_stack_client(request, provider_data):
         server_process = start_llama_stack_server(config_name)
 
         # Wait for server to be ready
-        if not wait_for_server_ready(base_url, timeout=120):
+        if not wait_for_server_ready(base_url, timeout=30, process=server_process):
             print("Server failed to start within timeout")
             server_process.terminate()
             raise RuntimeError(
-                f"Server failed to start within timeout. Check that config '{config_name}' exists and is valid."
+                f"Server failed to start within timeout. Check that config '{config_name}' exists and is valid. "
+                f"See server.log for details."
             )
 
         print(f"Server is ready at {base_url}")
@@ -227,3 +244,31 @@ def llama_stack_client(request, provider_data):
 def openai_client(client_with_models):
     base_url = f"{client_with_models.base_url}/v1/openai/v1"
     return OpenAI(base_url=base_url, api_key="fake")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def cleanup_server_process(request):
+    """Cleanup server process at the end of the test session."""
+    yield  # Run tests
+
+    if hasattr(request.session, "_llama_stack_server_process"):
+        server_process = request.session._llama_stack_server_process
+        if server_process:
+            if server_process.poll() is None:
+                print("Terminating llama stack server process...")
+            else:
+                print(f"Server process already terminated with return code: {server_process.returncode}")
+                return
+            try:
+                server_process.terminate()
+                server_process.wait(timeout=10)
+                print("Server process terminated gracefully")
+            except subprocess.TimeoutExpired:
+                print("Server process did not terminate gracefully, killing it")
+                server_process.kill()
+                server_process.wait()
+                print("Server process killed")
+            except Exception as e:
+                print(f"Error during server cleanup: {e}")
+        else:
+            print("Server process not found - won't be able to cleanup")
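
Note for reviewers (not part of the commit): if a server-mode job fails, the two
log files this setup produces are the place to look; the pytest log name carries
the matrix test-type, shown here for the inference entry of the matrix:

    cat server.log              # server output, written via LLAMA_STACK_LOG_FILE
    cat pytest-inference.log    # pytest output captured by tee in the workflow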