From 6fa5f2b7ec62f220dd13e47e36d266d517dba2dc Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 15 Aug 2025 11:50:08 -0700 Subject: [PATCH] feat(ci): make recording workflow simpler, more parameterizable --- .../actions/run-and-record-tests/action.yml | 14 ++- .github/workflows/integration-tests.yml | 31 +++---- .../workflows/record-integration-tests.yml | 90 +++++-------------- scripts/integration-tests.sh | 71 ++++++++++----- 4 files changed, 89 insertions(+), 117 deletions(-) diff --git a/.github/actions/run-and-record-tests/action.yml b/.github/actions/run-and-record-tests/action.yml index 573148e46..a5976c5c8 100644 --- a/.github/actions/run-and-record-tests/action.yml +++ b/.github/actions/run-and-record-tests/action.yml @@ -2,9 +2,13 @@ name: 'Run and Record Tests' description: 'Run integration tests and handle recording/artifact upload' inputs: - test-types: - description: 'JSON array of test types to run' + test-subdirs: + description: 'Comma-separated list of test subdirectories to run' required: true + test-pattern: + description: 'Regex pattern to pass to pytest -k' + required: false + default: '' stack-config: description: 'Stack configuration to use' required: true @@ -35,9 +39,11 @@ runs: ./scripts/integration-tests.sh \ --stack-config '${{ inputs.stack-config }}' \ --provider '${{ inputs.provider }}' \ - --test-types '${{ inputs.test-types }}' \ + --test-subdirs '${{ inputs.test-subdirs }}' \ + --test-pattern '${{ inputs.test-pattern }}' \ --inference-mode '${{ inputs.inference-mode }}' \ - ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} + ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \ + | tee pytest-${{ inputs.inference-mode }}.log - name: Commit and push recordings diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 9ef49fba3..fc56f62ea 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -31,6 
+31,14 @@ on: description: 'Test against a specific provider' type: string default: 'ollama' + test-subdirs: + description: 'Comma-separated list of test subdirectories to run' + type: string + default: '' + test-pattern: + description: 'Regex pattern to pass to pytest -k' + type: string + default: '' concurrency: # Skip concurrency for pushes to main - each commit should be tested independently @@ -38,28 +46,8 @@ concurrency: cancel-in-progress: true jobs: - discover-tests: - runs-on: ubuntu-latest - outputs: - test-types: ${{ steps.generate-test-types.outputs.test-types }} - - steps: - - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: Generate test types - id: generate-test-types - run: | - # Get test directories dynamically, excluding non-test directories - # NOTE: we are excluding post_training since the tests take too long - TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d | - sed 's|tests/integration/||' | - grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" | - sort | jq -R -s -c 'split("\n")[:-1]') - echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT run-replay-mode-tests: - needs: discover-tests runs-on: ubuntu-latest name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }} @@ -90,7 +78,8 @@ jobs: - name: Run tests uses: ./.github/actions/run-and-record-tests with: - test-types: ${{ needs.discover-tests.outputs.test-types }} + test-subdirs: ${{ inputs.test-subdirs }} + test-pattern: ${{ inputs.test-pattern }} stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }} provider: ${{ matrix.provider }} inference-mode: 'replay' diff --git a/.github/workflows/record-integration-tests.yml b/.github/workflows/record-integration-tests.yml index b31709a4f..4fe244e8e 100644 --- 
a/.github/workflows/record-integration-tests.yml +++ b/.github/workflows/record-integration-tests.yml @@ -1,88 +1,39 @@ +# This workflow should be run manually when needing to re-record tests. This happens when you have +# - added a new test +# - or changed an existing test such that a new inference call is made +# You should make a PR and then run this workflow on that PR branch. The workflow will re-record the +# tests and commit the recordings to the PR branch. name: Integration Tests (Record) run-name: Run the integration test suite from tests/integration on: - pull_request_target: - branches: [ main ] - types: [opened, synchronize, labeled] - paths: - - 'llama_stack/**' - - 'tests/**' - - 'uv.lock' - - 'pyproject.toml' - - '.github/workflows/record-integration-tests.yml' # This workflow - - '.github/actions/setup-ollama/action.yml' - - '.github/actions/setup-test-environment/action.yml' - - '.github/actions/run-and-record-tests/action.yml' workflow_dispatch: inputs: + test-subdirs: + description: 'Comma-separated list of test subdirectories to run' + type: string + default: '' test-provider: description: 'Test against a specific provider' type: string default: 'ollama' - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number }} - cancel-in-progress: true + run-vision-tests: + description: 'Whether to run vision tests' + type: boolean + default: false + test-pattern: + description: 'Regex pattern to pass to pytest -k' + type: string + default: '' jobs: - discover-tests: - if: contains(github.event.pull_request.labels.*.name, 're-record-tests') || - contains(github.event.pull_request.labels.*.name, 're-record-vision-tests') - runs-on: ubuntu-latest - outputs: - test-types: ${{ steps.generate-test-types.outputs.test-types }} - matrix-modes: ${{ steps.generate-test-types.outputs.matrix-modes }} - - steps: - - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: Generate test 
types - id: generate-test-types - run: | - # Get test directories dynamically, excluding non-test directories - TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" | - grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" | - sort | jq -R -s -c 'split("\n")[:-1]') - echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT - - labels=$(gh pr view ${{ github.event.pull_request.number }} --json labels --jq '.labels[].name') - echo "labels=$labels" - - modes_array=() - if [[ $labels == *"re-record-vision-tests"* ]]; then - modes_array+=("vision") - fi - if [[ $labels == *"re-record-tests"* ]]; then - modes_array+=("non-vision") - fi - - # Convert to JSON array - if [ ${#modes_array[@]} -eq 0 ]; then - matrix_modes="[]" - else - matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]') - fi - echo "matrix_modes=$matrix_modes" - echo "matrix-modes=$matrix_modes" >> $GITHUB_OUTPUT - - env: - GH_TOKEN: ${{ github.token }} - record-tests: - needs: discover-tests runs-on: ubuntu-latest permissions: contents: write - strategy: - fail-fast: false - matrix: - mode: ${{ fromJSON(needs.discover-tests.outputs.matrix-modes) }} - steps: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -96,14 +47,15 @@ jobs: python-version: "3.12" # Use single Python version for recording client-version: "latest" provider: ${{ inputs.test-provider || 'ollama' }} - run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }} + run-vision-tests: ${{ inputs.run-vision-tests }} inference-mode: 'record' - name: Run and record tests uses: ./.github/actions/run-and-record-tests with: - test-types: ${{ needs.discover-tests.outputs.test-types }} + test-pattern: ${{ inputs.test-pattern }} + test-subdirs: ${{ inputs.test-subdirs }} stack-config: 'server:ci-tests' # recording must be done with server since more tests are run provider: ${{ inputs.test-provider || 'ollama' }} 
inference-mode: 'record' - run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }} + run-vision-tests: ${{ inputs.run-vision-tests }} diff --git a/scripts/integration-tests.sh b/scripts/integration-tests.sh index 8dbbcae90..525baf5de 100755 --- a/scripts/integration-tests.sh +++ b/scripts/integration-tests.sh @@ -14,7 +14,8 @@ set -euo pipefail # Default values STACK_CONFIG="" PROVIDER="" -TEST_TYPES='["inference"]' +TEST_SUBDIRS="" +TEST_PATTERN="" RUN_VISION_TESTS="false" INFERENCE_MODE="replay" EXTRA_PARAMS="" @@ -27,23 +28,24 @@ Usage: $0 [OPTIONS] Options: --stack-config STRING Stack configuration to use (required) --provider STRING Provider to use (ollama, vllm, etc.) (required) - --test-types JSON JSON array of test types to run (default: '["inference"]') + --test-subdirs STRING Comma-separated list of test subdirectories to run (default: all non-excluded subdirectories of tests/integration) --run-vision-tests Run vision tests instead of regular tests --inference-mode STRING Inference mode: record or replay (default: replay) + --test-pattern STRING Regex pattern to pass to pytest -k --help Show this help message Examples: # Basic inference tests with ollama - $0 --stack-config server:ollama --provider ollama + $0 --stack-config server:ci-tests --provider ollama - # Multiple test types with vllm - $0 --stack-config server:vllm --provider vllm --test-types '["inference", "agents"]' + # Multiple test directories with vllm + $0 --stack-config server:ci-tests --provider vllm --test-subdirs 'inference,agents' # Vision tests with ollama - $0 --stack-config server:ollama --provider ollama --run-vision-tests + $0 --stack-config server:ci-tests --provider ollama --run-vision-tests # Record mode for updating test recordings - $0 --stack-config server:ollama --provider ollama --inference-mode record + $0 --stack-config server:ci-tests --provider ollama --inference-mode record EOF } @@ -58,8 +60,8 @@ while [[ $# -gt 0 ]]; do PROVIDER="$2" shift 2 ;; - --test-types) - TEST_TYPES="$2" + 
--test-subdirs) + TEST_SUBDIRS="$2" shift 2 ;; --run-vision-tests) @@ -70,6 +72,10 @@ while [[ $# -gt 0 ]]; do INFERENCE_MODE="$2" shift 2 ;; + --test-pattern) + TEST_PATTERN="$2" + shift 2 + ;; --help) usage exit 0 @@ -99,9 +105,10 @@ fi echo "=== Llama Stack Integration Test Runner ===" echo "Stack Config: $STACK_CONFIG" echo "Provider: $PROVIDER" -echo "Test Types: $TEST_TYPES" +echo "Test Subdirs: $TEST_SUBDIRS" echo "Vision Tests: $RUN_VISION_TESTS" echo "Inference Mode: $INFERENCE_MODE" +echo "Test Pattern: $TEST_PATTERN" echo "" # Check storage and memory before tests @@ -164,16 +171,21 @@ if [[ "$PROVIDER" == "vllm" ]]; then EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls" fi +PYTEST_PATTERN="not( $EXCLUDE_TESTS )" +if [[ -n "$TEST_PATTERN" ]]; then + PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN" +fi + # Run vision tests if specified if [[ "$RUN_VISION_TESTS" == "true" ]]; then echo "Running vision tests..." if uv run pytest -s -v tests/integration/inference/test_vision_inference.py \ --stack-config="$STACK_CONFIG" \ - -k "not( $EXCLUDE_TESTS )" \ + -k "$PYTEST_PATTERN" \ --vision-model=ollama/llama3.2-vision:11b \ --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \ --color=yes $EXTRA_PARAMS \ - --capture=tee-sys | tee pytest-${INFERENCE_MODE}-vision.log; then + --capture=tee-sys; then echo "✅ Vision tests completed successfully" else echo "❌ Vision tests failed" @@ -183,28 +195,34 @@ if [[ "$RUN_VISION_TESTS" == "true" ]]; then fi # Run regular tests -echo "Test types to run: $TEST_TYPES" +if [[ -z "$TEST_SUBDIRS" ]]; then + TEST_SUBDIRS=$(find tests/integration -maxdepth 1 -mindepth 1 -type d | + sed 's|tests/integration/||' | + grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" | + sort) +fi +echo "Test subdirs to run: $TEST_SUBDIRS" # Collect all test files for the specified test types TEST_FILES="" -for test_type in $(echo "$TEST_TYPES" | jq -r '.[]'); do +for test_subdir in $(echo 
"$TEST_SUBDIRS" | tr ',' '\n'); do # Skip certain test types for vllm provider if [[ "$PROVIDER" == "vllm" ]]; then - if [[ "$test_type" == "safety" ]] || [[ "$test_type" == "post_training" ]] || [[ "$test_type" == "tool_runtime" ]]; then - echo "Skipping $test_type for vllm provider" + if [[ "$test_subdir" == "safety" ]] || [[ "$test_subdir" == "post_training" ]] || [[ "$test_subdir" == "tool_runtime" ]]; then + echo "Skipping $test_subdir for vllm provider" continue fi fi - if [[ -d "tests/integration/$test_type" ]]; then + if [[ -d "tests/integration/$test_subdir" ]]; then # Find all Python test files in this directory - test_files=$(find tests/integration/$test_type -name "test_*.py" -o -name "*_test.py") + test_files=$(find tests/integration/$test_subdir -name "test_*.py" -o -name "*_test.py") if [[ -n "$test_files" ]]; then TEST_FILES="$TEST_FILES $test_files" - echo "Added test files from $test_type: $(echo $test_files | wc -w) files" + echo "Added test files from $test_subdir: $(echo $test_files | wc -w) files" fi else - echo "Warning: Directory tests/integration/$test_type does not exist" + echo "Warning: Directory tests/integration/$test_subdir does not exist" fi done @@ -217,14 +235,21 @@ echo "" echo "=== Running all collected tests in a single pytest command ===" echo "Total test files: $(echo $TEST_FILES | wc -w)" -if uv run pytest -s -v $TEST_FILES \ +set +e +uv run pytest -s -v $TEST_FILES \ --stack-config="$STACK_CONFIG" \ - -k "not( $EXCLUDE_TESTS )" \ + -k "$PYTEST_PATTERN" \ --text-model="$TEXT_MODEL" \ --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \ --color=yes $EXTRA_PARAMS \ - --capture=tee-sys | tee pytest-${INFERENCE_MODE}-all.log; then + --capture=tee-sys +exit_code=$? +set -e + +if [ $exit_code -eq 0 ]; then echo "✅ All tests completed successfully" +elif [ $exit_code -eq 5 ]; then + echo "⚠️ No tests collected (pattern matched no tests)" else echo "❌ Tests failed" exit 1