From dae1fcd3c2bb0b440f29c2f7291a5d47c81b4051 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Thu, 3 Jul 2025 19:51:46 +0200
Subject: [PATCH] ci: let pytest run the distro server (#2586)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

* Use the #2580 functionality to auto-start the server with the tests
* Reduce the server-readiness timeout to 30 seconds
* Print server logs on errors
* Collect pytest logs to a per-test-type file (pytest-<test-type>.log)

Signed-off-by: Sébastien Han
---
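Note for reviewers (not part of the commit): with the new server:ollama stack
config the fixtures start and stop the server themselves, so the CI job below
can be reproduced locally with roughly the following command, assuming Ollama
is already running as it is in CI (test directory, model names and filter are
taken from the workflow; adjust as needed):

    uv run pytest -s -v tests/integration/inference \
      --stack-config=server:ollama \
      -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
      --text-model="meta-llama/Llama-3.2-3B-Instruct" \
      --embedding-model=all-MiniLM-L6-v2
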
 .github/workflows/integration-tests.yml | 41 +++----------------
 tests/integration/fixtures/common.py    | 57 ++++++++++++++++++++++---
 2 files changed, 56 insertions(+), 42 deletions(-)

diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index 32e221128..0dc7a9889 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -25,7 +25,7 @@ jobs:
         # Listing tests manually since some of them currently fail
         # TODO: generate matrix list from tests/integration when fixed
         test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime, vector_io]
-        client-type: [library, http]
+        client-type: [library, server]
         python-version: ["3.12", "3.13"]
       fail-fast: false # we want to run all tests regardless of failure
@@ -45,39 +45,6 @@ jobs:
         run: |
           uv run llama stack build --template ollama --image-type venv
 
-      - name: Start Llama Stack server in background
-        if: matrix.client-type == 'http'
-        env:
-          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
-        run: |
-          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv --env OLLAMA_URL="http://0.0.0.0:11434" &
-
-      - name: Wait for Llama Stack server to be ready
-        if: matrix.client-type == 'http'
-        run: |
-          echo "Waiting for Llama Stack server..."
-          for i in {1..30}; do
-            if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
-              echo "Llama Stack server is up!"
-              exit 0
-            fi
-            sleep 1
-          done
-          echo "Llama Stack server failed to start"
-          cat server.log
-          exit 1
-
-      - name: Verify Ollama status is OK
-        if: matrix.client-type == 'http'
-        run: |
-          echo "Verifying Ollama status..."
-          ollama_status=$(curl -s -L http://127.0.0.1:8321/v1/providers/ollama|jq --raw-output .health.status)
-          echo "Ollama status: $ollama_status"
-          if [ "$ollama_status" != "OK" ]; then
-            echo "Ollama health check failed"
-            exit 1
-          fi
-
       - name: Check Storage and Memory Available Before Tests
         if: ${{ always() }}
         run: |
@@ -92,12 +59,14 @@ jobs:
           if [ "${{ matrix.client-type }}" == "library" ]; then
             stack_config="ollama"
           else
-            stack_config="http://localhost:8321"
+            stack_config="server:ollama"
           fi
           uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
             -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
             --text-model="meta-llama/Llama-3.2-3B-Instruct" \
-            --embedding-model=all-MiniLM-L6-v2
+            --embedding-model=all-MiniLM-L6-v2 \
+            --color=yes \
+            --capture=tee-sys | tee pytest-${{ matrix.test-type }}.log
 
       - name: Check Storage and Memory Available After Tests
         if: ${{ always() }}
diff --git a/tests/integration/fixtures/common.py b/tests/integration/fixtures/common.py
index 2d6092e44..ecd29484b 100644
--- a/tests/integration/fixtures/common.py
+++ b/tests/integration/fixtures/common.py
@@ -37,26 +37,42 @@ def is_port_available(port: int, host: str = "localhost") -> bool:
 def start_llama_stack_server(config_name: str) -> subprocess.Popen:
     """Start a llama stack server with the given config."""
     cmd = ["llama", "stack", "run", config_name]
-
-    # Start server in background
-    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    devnull = open(os.devnull, "w")
+    process = subprocess.Popen(
+        cmd,
+        stdout=devnull,  # redirect stdout to devnull to prevent deadlock
+        stderr=devnull,  # redirect stderr to devnull to prevent deadlock
+        text=True,
+        env={**os.environ, "LLAMA_STACK_LOG_FILE": "server.log"},
+    )
     return process
 
 
-def wait_for_server_ready(base_url: str, timeout: int = 120) -> bool:
+def wait_for_server_ready(base_url: str, timeout: int = 30, process: subprocess.Popen | None = None) -> bool:
     """Wait for the server to be ready by polling the health endpoint."""
     health_url = f"{base_url}/v1/health"
     start_time = time.time()
 
     while time.time() - start_time < timeout:
+        if process and process.poll() is not None:
+            print(f"Server process terminated with return code: {process.returncode}")
+            return False
+
         try:
             response = requests.get(health_url, timeout=5)
             if response.status_code == 200:
                 return True
         except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
             pass
+
+        # Print progress every 5 seconds
+        elapsed = time.time() - start_time
+        if int(elapsed) % 5 == 0 and elapsed > 0:
+            print(f"Waiting for server at {base_url}... ({elapsed:.1f}s elapsed)")
+
         time.sleep(0.5)
+
+    print(f"Server failed to respond within {timeout} seconds")
     return False
@@ -179,11 +195,12 @@ def llama_stack_client(request, provider_data):
         server_process = start_llama_stack_server(config_name)
 
         # Wait for server to be ready
-        if not wait_for_server_ready(base_url, timeout=120):
+        if not wait_for_server_ready(base_url, timeout=30, process=server_process):
             print("Server failed to start within timeout")
             server_process.terminate()
             raise RuntimeError(
-                f"Server failed to start within timeout. Check that config '{config_name}' exists and is valid."
+                f"Server failed to start within timeout. Check that config '{config_name}' exists and is valid. "
+                f"See server.log for details."
             )
 
         print(f"Server is ready at {base_url}")
@@ -227,3 +244,31 @@ def llama_stack_client(request, provider_data):
 def openai_client(client_with_models):
     base_url = f"{client_with_models.base_url}/v1/openai/v1"
     return OpenAI(base_url=base_url, api_key="fake")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def cleanup_server_process(request):
+    """Cleanup server process at the end of the test session."""
+    yield  # Run tests
+
+    if hasattr(request.session, "_llama_stack_server_process"):
+        server_process = request.session._llama_stack_server_process
+        if server_process:
+            if server_process.poll() is None:
+                print("Terminating llama stack server process...")
+            else:
+                print(f"Server process already terminated with return code: {server_process.returncode}")
+                return
+            try:
+                server_process.terminate()
+                server_process.wait(timeout=10)
+                print("Server process terminated gracefully")
+            except subprocess.TimeoutExpired:
+                print("Server process did not terminate gracefully, killing it")
+                server_process.kill()
+                server_process.wait()
+                print("Server process killed")
+            except Exception as e:
+                print(f"Error during server cleanup: {e}")
+        else:
+            print("Server process not found - won't be able to cleanup")
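
Note for reviewers (not part of the commit): if a server-mode job fails, the two
log files this setup produces are the place to look; the pytest log name carries
the matrix test-type, shown here for the inference entry of the matrix:

    cat server.log              # server output, written via LLAMA_STACK_LOG_FILE
    cat pytest-inference.log    # pytest output captured by tee in the workflow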