test: make sure integration tests run against the server (#1743)

Previously, the integration tests started the server but never actually exercised it: `--stack-config=ollama` uses the ollama template with the inline "llama stack as library" client, not the HTTP client.

This PR makes sure we test both ways: through the inline library client and through the HTTP client against the server started by the workflow.

We also add agents tests to the mix.
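
For context, the two modes look roughly like this from a client's point of view. This is an illustrative sketch, not code from this PR; the `LlamaStackAsLibraryClient` import path is assumed from the usual llama-stack setup of that time and may differ in your checkout.

```python
# Illustrative sketch of the two client modes the CI matrix now exercises.

# "library" mode: the stack runs in-process, no HTTP server involved.
# (Import path assumed from the standard llama-stack docs.)
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

library_client = LlamaStackAsLibraryClient("ollama")  # same value as --stack-config=ollama
library_client.initialize()

# "http" mode: a regular client talking to the server the workflow now starts.
from llama_stack_client import LlamaStackClient

http_client = LlamaStackClient(base_url="http://localhost:8321")

# Both expose the same API surface, e.g.:
#   client.inference.chat_completion(model_id=..., messages=[...])
```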

## Test Plan 

GitHub

---------

Signed-off-by: Sébastien Han <seb@redhat.com>
Co-authored-by: Sébastien Han <seb@redhat.com>
Ashwin Bharambe 2025-03-31 13:38:47 -07:00 committed by GitHub
parent 2ffa2b77ed
commit b440a1dc42
2 changed files with 17 additions and 2 deletions

GitHub Actions workflow for the integration tests:

```diff
@@ -25,7 +25,8 @@ jobs:
       matrix:
         # Listing tests manually since some of them currently fail
         # TODO: generate matrix list from tests/integration when fixed
-        test-type: [inference, datasets, inspect, scoring, post_training, providers]
+        test-type: [agents, inference, datasets, inspect, scoring, post_training, providers]
+        client-type: [library, http]
       fail-fast: false # we want to run all tests regardless of failure
     steps:
@@ -54,6 +55,8 @@ jobs:
           uv sync --extra dev --extra test
           uv pip install ollama faiss-cpu
           # always test against the latest version of the client
+          # TODO: this is not necessarily a good idea. we need to test against both published and latest
+          # to find out backwards compatibility issues.
           uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
           uv pip install -e .
           llama stack build --template ollama --image-type venv
@@ -74,6 +77,7 @@ jobs:
             exit 1
       - name: Start Llama Stack server in background
+        if: matrix.client-type == 'http'
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
         run: |
@@ -81,6 +85,7 @@ jobs:
           nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 &
       - name: Wait for Llama Stack server to be ready
+        if: matrix.client-type == 'http'
         run: |
           echo "Waiting for Llama Stack server..."
           for i in {1..30}; do
@@ -98,4 +103,12 @@ jobs:
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
         run: |
-          uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2
+          if [ "${{ matrix.client-type }}" == "library" ]; then
+            stack_config="ollama"
+          else
+            stack_config="http://localhost:8321"
+          fi
+          uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
+            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
+            --text-model="meta-llama/Llama-3.2-3B-Instruct" \
+            --embedding-model=all-MiniLM-L6-v2
```
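
Outside of CI, the same two-way split could be reproduced with a plain pytest parametrization. The sketch below uses hypothetical names (the `stack_config` fixture is not the project's actual conftest); it only mirrors the two `--stack-config` values the workflow passes.

```python
# Hypothetical sketch: mirror the CI matrix locally by parametrizing tests over
# the same two --stack-config values the workflow uses.
import pytest

STACK_CONFIGS = {
    "library": "ollama",                # inline "llama stack as library" client
    "http": "http://localhost:8321",    # HTTP client against a running server
}

@pytest.fixture(params=["library", "http"])
def stack_config(request):
    return STACK_CONFIGS[request.param]
```

Keeping the split in the CI matrix rather than in a fixture has the advantage that the server is only started for the http leg (see the `if: matrix.client-type == 'http'` conditions above) and each leg gets its own job logs.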

Text inference integration tests:

```diff
@@ -275,6 +275,7 @@ def test_text_chat_completion_first_token_profiling(client_with_models, text_mod
         model_id=text_model_id,
         messages=messages,
         stream=False,
+        timeout=120, # Increase timeout to 2 minutes for large conversation history
     )
     message_content = response.completion_message.content.lower().strip()
     assert len(message_content) > 0
@@ -301,6 +302,7 @@ def test_text_chat_completion_streaming(client_with_models, text_model_id, test_
         model_id=text_model_id,
         messages=[{"role": "user", "content": question}],
         stream=True,
+        timeout=120, # Increase timeout to 2 minutes for large conversation history
     )
     streamed_content = [str(chunk.event.delta.text.lower().strip()) for chunk in response]
     assert len(streamed_content) > 0
```
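
The `timeout=120` additions are per-request overrides on the client call. A minimal sketch of the same pattern outside the test suite, assuming a server at the port used in the workflow and the same model id:

```python
# Minimal sketch of the per-request timeout the diff adds (assumes a running
# stack at localhost:8321 with meta-llama/Llama-3.2-3B-Instruct registered).
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Which planet do humans live on?"}],
    stream=False,
    timeout=120,  # seconds; the default client timeout can be too tight for long histories
)
print(response.completion_message.content)
```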