From b440a1dc4209c07ba53997f38321fd2c3590eea1 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe
Date: Mon, 31 Mar 2025 13:38:47 -0700
Subject: [PATCH] test: make sure integration tests run against the server
 (#1743)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, the integration tests started the server but never actually
used it: `--stack-config=ollama` resolves to the ollama template and the
inline "llama stack as library" client, not the HTTP client.

This PR makes sure we test both ways, by adding a `client-type` matrix
dimension (library and http). We also add the agents tests to the mix.

## Test Plan

GitHub CI (a local reproduction sketch is appended after the diff).

---------

Signed-off-by: Sébastien Han
Co-authored-by: Sébastien Han
---
 .github/workflows/integration-tests.yml | 17 +++++++++++++++--
 .../inference/test_text_inference.py    |  2 ++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index 6e7e99ef9..e03c7401c 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -25,7 +25,8 @@ jobs:
       matrix:
         # Listing tests manually since some of them currently fail
         # TODO: generate matrix list from tests/integration when fixed
-        test-type: [inference, datasets, inspect, scoring, post_training, providers]
+        test-type: [agents, inference, datasets, inspect, scoring, post_training, providers]
+        client-type: [library, http]
       fail-fast: false # we want to run all tests regardless of failure
 
     steps:
@@ -54,6 +55,8 @@ jobs:
           uv sync --extra dev --extra test
           uv pip install ollama faiss-cpu
           # always test against the latest version of the client
+          # TODO: this is not necessarily a good idea. we need to test against both published and latest
+          # to find out backwards compatibility issues.
           uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
           uv pip install -e .
           llama stack build --template ollama --image-type venv
@@ -74,6 +77,7 @@ jobs:
           exit 1
 
       - name: Start Llama Stack server in background
+        if: matrix.client-type == 'http'
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
         run: |
@@ -81,6 +85,7 @@ jobs:
           nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 &
 
       - name: Wait for Llama Stack server to be ready
+        if: matrix.client-type == 'http'
         run: |
           echo "Waiting for Llama Stack server..."
           for i in {1..30}; do
@@ -98,4 +103,12 @@ jobs:
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
         run: |
-          uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2
+          if [ "${{ matrix.client-type }}" == "library" ]; then
+            stack_config="ollama"
+          else
+            stack_config="http://localhost:8321"
+          fi
+          uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
+            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
+            --text-model="meta-llama/Llama-3.2-3B-Instruct" \
+            --embedding-model=all-MiniLM-L6-v2
diff --git a/tests/integration/inference/test_text_inference.py b/tests/integration/inference/test_text_inference.py
index f558254e5..c1eec35f0 100644
--- a/tests/integration/inference/test_text_inference.py
+++ b/tests/integration/inference/test_text_inference.py
@@ -275,6 +275,7 @@ def test_text_chat_completion_first_token_profiling(client_with_models, text_mod
         model_id=text_model_id,
         messages=messages,
         stream=False,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
     )
     message_content = response.completion_message.content.lower().strip()
     assert len(message_content) > 0
@@ -301,6 +302,7 @@ def test_text_chat_completion_streaming(client_with_models, text_model_id, test_
         model_id=text_model_id,
         messages=[{"role": "user", "content": question}],
         stream=True,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
     )
     streamed_content = [str(chunk.event.delta.text.lower().strip()) for chunk in response]
     assert len(streamed_content) > 0
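
Local reproduction sketch for the two matrix legs. This is a minimal sketch, not part of the patch: it assumes ollama is already serving `meta-llama/Llama-3.2-3B-Instruct`, that the venv image was built with `llama stack build --template ollama --image-type venv` as the workflow does, and it uses the `inference` suite as a stand-in for `${{ matrix.test-type }}`. The commands mirror the test step above, including CI's `-k` deselection filter.

```bash
# Library-client leg: --stack-config=ollama loads the ollama template and runs
# the stack in-process ("llama stack as library"), so no server is needed.
uv run pytest -v tests/integration/inference \
  --stack-config=ollama \
  -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
  --text-model="meta-llama/Llama-3.2-3B-Instruct" \
  --embedding-model=all-MiniLM-L6-v2

# HTTP-client leg: start the server in the background, then point
# --stack-config at its URL so the tests go through the HTTP client.
nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml \
  --image-type venv > server.log 2>&1 &
uv run pytest -v tests/integration/inference \
  --stack-config=http://localhost:8321 \
  -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
  --text-model="meta-llama/Llama-3.2-3B-Instruct" \
  --embedding-model=all-MiniLM-L6-v2
```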