feat(ci): keep only one re-recording job because independent recordings will conflict (#2956)

A couple of important updates:

- When recording tests, we cannot be generating a matrix because all the
independent recordings will conflict.
- In fact, we just don't need a matrix on test types any more because
things are very fast and the overhead of `llama stack build` and setting
up `uv` etc. is much more.
- Refactored the running of tests into an independent action
This commit is contained in:
Ashwin Bharambe 2025-07-29 17:48:04 -07:00 committed by GitHub
parent b237df8f18
commit f6afb3c26b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 158 additions and 103 deletions

View file

@ -0,0 +1,73 @@
name: 'Run Integration Tests'
description: 'Run integration tests with configurable execution mode and provider settings'
inputs:
test-types:
description: 'Test types to run (JSON array)'
required: true
stack-config:
description: 'Stack configuration: "ci-tests" or "server:ci-tests"'
required: true
provider:
description: 'Provider to use: "ollama" or "vllm"'
required: true
inference-mode:
description: 'Inference mode: "record" or "replay"'
required: true
outputs:
logs-path:
description: 'Path to generated log files'
value: '*.log'
runs:
using: 'composite'
steps:
- name: Run Integration Tests
env:
LLAMA_STACK_CLIENT_TIMEOUT: "300"
LLAMA_STACK_TEST_RECORDING_DIR: "tests/integration/recordings"
LLAMA_STACK_TEST_INFERENCE_MODE: ${{ inputs.inference-mode }}
shell: bash
run: |
stack_config="${{ inputs.stack-config }}"
EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"
# Configure provider-specific settings
if [ "${{ inputs.provider }}" == "ollama" ]; then
export OLLAMA_URL="http://0.0.0.0:11434"
export TEXT_MODEL="ollama/llama3.2:3b-instruct-fp16"
export SAFETY_MODEL="ollama/llama-guard3:1b"
EXTRA_PARAMS="--safety-shield=llama-guard"
else
export VLLM_URL="http://localhost:8000/v1"
export TEXT_MODEL="vllm/meta-llama/Llama-3.2-1B-Instruct"
EXTRA_PARAMS=""
EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
fi
TEST_TYPES='${{ inputs.test-types }}'
echo "Test types to run: $TEST_TYPES"
for test_type in $(echo "$TEST_TYPES" | jq -r '.[]'); do
# if provider is vllm, exclude the following tests: (safety, post_training, tool_runtime)
if [ "${{ inputs.provider }}" == "vllm" ]; then
if [ "$test_type" == "safety" ] || [ "$test_type" == "post_training" ] || [ "$test_type" == "tool_runtime" ]; then
continue
fi
fi
echo "=== Running tests for: $test_type ==="
if uv run pytest -s -v tests/integration/$test_type --stack-config=${stack_config} \
-k "not( ${EXCLUDE_TESTS} )" \
--text-model=$TEXT_MODEL \
--embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
--color=yes ${EXTRA_PARAMS} \
--capture=tee-sys | tee pytest-${{ inputs.inference-mode }}-$test_type.log; then
echo "✅ Tests completed for $test_type"
else
echo "❌ Tests failed for $test_type"
exit 1
fi
done