feat(ci): add support for running vision inference tests (#2972)

This PR significantly refactors the Integration Tests workflow. The main
goal behind the PR was to enable recording of vision tests which were
never run as part of our CI ever before. During debugging, I ended up
making several other changes refactoring and hopefully increasing the
robustness of the workflow.

After doing the experiments, I have updated the trigger event to be
`pull_request_target` so this workflow can get write permissions by
default but it will run with source code from the base (main) branch in
the source repository only. If you do change the workflow, you'd need to
experiment using the `workflow_dispatch` triggers. This should not be
news to anyone using Github Actions (except me!)

It is likely to be a little rocky though while I learn more about GitHub
Actions, etc. Please be patient :)

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
Ashwin Bharambe 2025-07-31 11:50:42 -07:00 committed by GitHub
parent 709c974bd8
commit 27d866795c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
108 changed files with 13985 additions and 15254 deletions

View file

@ -0,0 +1,198 @@
name: 'Run and Record Tests'
description: 'Run integration tests and handle recording/artifact upload'
inputs:
test-types:
description: 'JSON array of test types to run'
required: true
stack-config:
description: 'Stack configuration to use'
required: true
provider:
description: 'Provider to use for tests'
required: true
inference-mode:
description: 'Inference mode (record or replay)'
required: true
run-vision-tests:
description: 'Whether to run vision tests'
required: false
default: 'false'
runs:
using: 'composite'
steps:
- name: Check Storage and Memory Available Before Tests
if: ${{ always() }}
shell: bash
run: |
free -h
df -h
- name: Set environment variables
shell: bash
run: |
echo "LLAMA_STACK_CLIENT_TIMEOUT=300" >> $GITHUB_ENV
echo "LLAMA_STACK_TEST_INFERENCE_MODE=${{ inputs.inference-mode }}" >> $GITHUB_ENV
# Configure provider-specific settings
if [ "${{ inputs.provider }}" == "ollama" ]; then
echo "OLLAMA_URL=http://0.0.0.0:11434" >> $GITHUB_ENV
echo "TEXT_MODEL=ollama/llama3.2:3b-instruct-fp16" >> $GITHUB_ENV
echo "SAFETY_MODEL=ollama/llama-guard3:1b" >> $GITHUB_ENV
else
echo "VLLM_URL=http://localhost:8000/v1" >> $GITHUB_ENV
echo "TEXT_MODEL=vllm/meta-llama/Llama-3.2-1B-Instruct" >> $GITHUB_ENV
fi
if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
echo "LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings/vision" >> $GITHUB_ENV
else
echo "LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings" >> $GITHUB_ENV
fi
- name: Run Llama Stack Server
if: ${{ contains(inputs.stack-config, 'server:') }}
shell: bash
run: |
# Run this so pytest in a loop doesn't start-stop servers in a loop
echo "Starting Llama Stack Server"
nohup uv run llama stack run ci-tests --image-type venv > server.log 2>&1 &
echo "Waiting for Llama Stack Server to start"
for i in {1..30}; do
if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
echo "Llama Stack Server started"
exit 0
fi
sleep 1
done
echo "Llama Stack Server failed to start"
cat server.log
exit 1
- name: Run Integration Tests
shell: bash
run: |
stack_config="${{ inputs.stack-config }}"
EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"
# Configure provider-specific settings
if [ "${{ inputs.provider }}" == "ollama" ]; then
EXTRA_PARAMS="--safety-shield=llama-guard"
else
EXTRA_PARAMS=""
EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
fi
if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
if uv run pytest -s -v tests/integration/inference/test_vision_inference.py --stack-config=${stack_config} \
-k "not( ${EXCLUDE_TESTS} )" \
--vision-model=ollama/llama3.2-vision:11b \
--embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
--color=yes ${EXTRA_PARAMS} \
--capture=tee-sys | tee pytest-${{ inputs.inference-mode }}-vision.log; then
echo "✅ Tests completed for vision"
else
echo "❌ Tests failed for vision"
exit 1
fi
exit 0
fi
# Run non-vision tests
TEST_TYPES='${{ inputs.test-types }}'
echo "Test types to run: $TEST_TYPES"
# Collect all test files for the specified test types
TEST_FILES=""
for test_type in $(echo "$TEST_TYPES" | jq -r '.[]'); do
# if provider is vllm, exclude the following tests: (safety, post_training, tool_runtime)
if [ "${{ inputs.provider }}" == "vllm" ]; then
if [ "$test_type" == "safety" ] || [ "$test_type" == "post_training" ] || [ "$test_type" == "tool_runtime" ]; then
echo "Skipping $test_type for vllm provider"
continue
fi
fi
if [ -d "tests/integration/$test_type" ]; then
# Find all Python test files in this directory
test_files=$(find tests/integration/$test_type -name "test_*.py" -o -name "*_test.py")
if [ -n "$test_files" ]; then
TEST_FILES="$TEST_FILES $test_files"
echo "Added test files from $test_type: $(echo $test_files | wc -w) files"
fi
else
echo "Warning: Directory tests/integration/$test_type does not exist"
fi
done
if [ -z "$TEST_FILES" ]; then
echo "No test files found for the specified test types"
exit 1
fi
echo "=== Running all collected tests in a single pytest command ==="
echo "Total test files: $(echo $TEST_FILES | wc -w)"
if uv run pytest -s -v $TEST_FILES --stack-config=${stack_config} \
-k "not( ${EXCLUDE_TESTS} )" \
--text-model=$TEXT_MODEL \
--embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
--color=yes ${EXTRA_PARAMS} \
--capture=tee-sys | tee pytest-${{ inputs.inference-mode }}-all.log; then
echo "✅ All tests completed successfully"
else
echo "❌ Tests failed"
exit 1
fi
- name: Check Storage and Memory Available After Tests
if: ${{ always() }}
shell: bash
run: |
free -h
df -h
- name: Commit and push recordings
if: ${{ inputs.inference-mode == 'record' }}
shell: bash
run: |
echo "Checking for recording changes"
git status --porcelain tests/integration/recordings/
if [[ -n $(git status --porcelain tests/integration/recordings/) ]]; then
echo "New recordings detected, committing and pushing"
git add tests/integration/recordings/
if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
git commit -m "Recordings update from CI (vision)"
else
git commit -m "Recordings update from CI"
fi
git fetch origin ${{ github.event.pull_request.head.ref }}
git rebase origin/${{ github.event.pull_request.head.ref }}
echo "Rebased successfully"
git push origin HEAD:${{ github.event.pull_request.head.ref }}
echo "Pushed successfully"
else
echo "No recording changes"
fi
- name: Write inference logs to file
if: ${{ always() }}
shell: bash
run: |
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true
- name: Upload logs
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: ${{ inputs.inference-mode }}-logs-${{ github.run_id }}-${{ github.run_attempt || '' }}-${{ inputs.provider }}
path: |
*.log
retention-days: 1