feat(ci): make recording workflow simpler, more parameterizable
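
The test runner now takes a comma-separated --test-subdirs list and an
optional --test-pattern (passed to pytest -k) instead of a JSON array of
test types. A sketch of the new invocation (values are illustrative):

    ./scripts/integration-tests.sh \
      --stack-config server:ci-tests \
      --provider ollama \
      --test-subdirs 'inference,agents' \
      --test-pattern 'streaming'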

commit 6fa5f2b7ec
parent a6e2c18909
Author: Ashwin Bharambe
Date:   2025-08-15 11:50:08 -07:00

4 changed files with 89 additions and 117 deletions

.github/actions/run-and-record-tests/action.yml

@@ -2,9 +2,13 @@ name: 'Run and Record Tests'
 description: 'Run integration tests and handle recording/artifact upload'
 inputs:
-  test-types:
-    description: 'JSON array of test types to run'
+  test-subdirs:
+    description: 'Comma-separated list of test subdirectories to run'
     required: true
+  test-pattern:
+    description: 'Regex pattern to pass to pytest -k'
+    required: false
+    default: ''
   stack-config:
     description: 'Stack configuration to use'
     required: true
@@ -35,9 +39,11 @@ runs:
         ./scripts/integration-tests.sh \
           --stack-config '${{ inputs.stack-config }}' \
           --provider '${{ inputs.provider }}' \
-          --test-types '${{ inputs.test-types }}' \
+          --test-subdirs '${{ inputs.test-subdirs }}' \
+          --test-pattern '${{ inputs.test-pattern }}' \
           --inference-mode '${{ inputs.inference-mode }}' \
-          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }}
+          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
+          | tee pytest-${{ inputs.inference-mode }}.log
 
     - name: Commit and push recordings

.github/workflows/integration-tests.yml

@@ -31,6 +31,14 @@ on:
       description: 'Test against a specific provider'
       type: string
       default: 'ollama'
+    test-subdirs:
+      description: 'Comma-separated list of test subdirectories to run'
+      type: string
+      default: ''
+    test-pattern:
+      description: 'Regex pattern to pass to pytest -k'
+      type: string
+      default: ''
 
 concurrency:
   # Skip concurrency for pushes to main - each commit should be tested independently
@@ -38,28 +46,8 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  discover-tests:
-    runs-on: ubuntu-latest
-    outputs:
-      test-types: ${{ steps.generate-test-types.outputs.test-types }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Generate test types
-        id: generate-test-types
-        run: |
-          # Get test directories dynamically, excluding non-test directories
-          # NOTE: we are excluding post_training since the tests take too long
-          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d |
-            sed 's|tests/integration/||' |
-            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
-            sort | jq -R -s -c 'split("\n")[:-1]')
-          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
-
   run-replay-mode-tests:
-    needs: discover-tests
     runs-on: ubuntu-latest
     name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
@@ -90,7 +78,8 @@ jobs:
       - name: Run tests
        uses: ./.github/actions/run-and-record-tests
         with:
-          test-types: ${{ needs.discover-tests.outputs.test-types }}
+          test-subdirs: ${{ inputs.test-subdirs }}
+          test-pattern: ${{ inputs.test-pattern }}
           stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
           provider: ${{ matrix.provider }}
           inference-mode: 'replay'

.github/workflows/record-integration-tests.yml

@@ -1,88 +1,39 @@
+# This workflow should be run manually when needing to re-record tests. This happens when you have
+# - added a new test
+# - or changed an existing test such that a new inference call is made
+# You should make a PR and then run this workflow on that PR branch. The workflow will re-record the
+# tests and commit the recordings to the PR branch.
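+# Example dispatch from a PR branch (assumes the gh CLI; input values are illustrative):
+#   gh workflow run record-integration-tests.yml --ref my-pr-branch \
+#     -f test-subdirs='inference,agents' -f test-pattern='streaming'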
 name: Integration Tests (Record)
 
 run-name: Run the integration test suite from tests/integration
 
 on:
-  pull_request_target:
-    branches: [ main ]
-    types: [opened, synchronize, labeled]
-    paths:
-      - 'llama_stack/**'
-      - 'tests/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - '.github/workflows/record-integration-tests.yml' # This workflow
-      - '.github/actions/setup-ollama/action.yml'
-      - '.github/actions/setup-test-environment/action.yml'
-      - '.github/actions/run-and-record-tests/action.yml'
   workflow_dispatch:
     inputs:
+      test-subdirs:
+        description: 'Comma-separated list of test subdirectories to run'
+        type: string
+        default: ''
       test-provider:
         description: 'Test against a specific provider'
         type: string
         default: 'ollama'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
-  cancel-in-progress: true
+      run-vision-tests:
+        description: 'Whether to run vision tests'
+        type: boolean
+        default: false
+      test-pattern:
+        description: 'Regex pattern to pass to pytest -k'
+        type: string
+        default: ''
 
 jobs:
-  discover-tests:
-    if: contains(github.event.pull_request.labels.*.name, 're-record-tests') ||
-        contains(github.event.pull_request.labels.*.name, 're-record-vision-tests')
-    runs-on: ubuntu-latest
-    outputs:
-      test-types: ${{ steps.generate-test-types.outputs.test-types }}
-      matrix-modes: ${{ steps.generate-test-types.outputs.matrix-modes }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Generate test types
-        id: generate-test-types
-        run: |
-          # Get test directories dynamically, excluding non-test directories
-          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
-            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
-            sort | jq -R -s -c 'split("\n")[:-1]')
-          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
-
-          labels=$(gh pr view ${{ github.event.pull_request.number }} --json labels --jq '.labels[].name')
-          echo "labels=$labels"
-
-          modes_array=()
-          if [[ $labels == *"re-record-vision-tests"* ]]; then
-            modes_array+=("vision")
-          fi
-          if [[ $labels == *"re-record-tests"* ]]; then
-            modes_array+=("non-vision")
-          fi
-
-          # Convert to JSON array
-          if [ ${#modes_array[@]} -eq 0 ]; then
-            matrix_modes="[]"
-          else
-            matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]')
-          fi
-          echo "matrix_modes=$matrix_modes"
-          echo "matrix-modes=$matrix_modes" >> $GITHUB_OUTPUT
-        env:
-          GH_TOKEN: ${{ github.token }}
-
   record-tests:
-    needs: discover-tests
     runs-on: ubuntu-latest
     permissions:
       contents: write
-    strategy:
-      fail-fast: false
-      matrix:
-        mode: ${{ fromJSON(needs.discover-tests.outputs.matrix-modes) }}
 
     steps:
       - name: Checkout repository
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -96,14 +47,15 @@ jobs:
           python-version: "3.12" # Use single Python version for recording
           client-version: "latest"
           provider: ${{ inputs.test-provider || 'ollama' }}
-          run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
+          run-vision-tests: ${{ inputs.run-vision-tests }}
           inference-mode: 'record'
 
       - name: Run and record tests
         uses: ./.github/actions/run-and-record-tests
         with:
+          test-pattern: ${{ inputs.test-pattern }}
+          test-subdirs: ${{ inputs.test-subdirs }}
-          test-types: ${{ needs.discover-tests.outputs.test-types }}
           stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
           provider: ${{ inputs.test-provider || 'ollama' }}
           inference-mode: 'record'
-          run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
+          run-vision-tests: ${{ inputs.run-vision-tests }}

scripts/integration-tests.sh

@@ -14,7 +14,8 @@ set -euo pipefail
 # Default values
 STACK_CONFIG=""
 PROVIDER=""
-TEST_TYPES='["inference"]'
+TEST_SUBDIRS=""
+TEST_PATTERN=""
 RUN_VISION_TESTS="false"
 INFERENCE_MODE="replay"
 EXTRA_PARAMS=""
@@ -27,23 +28,24 @@ Usage: $0 [OPTIONS]
 Options:
   --stack-config STRING    Stack configuration to use (required)
   --provider STRING        Provider to use (ollama, vllm, etc.) (required)
-  --test-types JSON        JSON array of test types to run (default: '["inference"]')
+  --test-subdirs STRING    Comma-separated list of test subdirectories to run (default: all discovered subdirs)
   --run-vision-tests       Run vision tests instead of regular tests
   --inference-mode STRING  Inference mode: record or replay (default: replay)
+  --test-pattern STRING    Regex pattern to pass to pytest -k
   --help                   Show this help message
 
 Examples:
   # Basic inference tests with ollama
-  $0 --stack-config server:ollama --provider ollama
+  $0 --stack-config server:ci-tests --provider ollama
 
-  # Multiple test types with vllm
-  $0 --stack-config server:vllm --provider vllm --test-types '["inference", "agents"]'
+  # Multiple test directories with vllm
+  $0 --stack-config server:ci-tests --provider vllm --test-subdirs 'inference,agents'
 
   # Vision tests with ollama
-  $0 --stack-config server:ollama --provider ollama --run-vision-tests
+  $0 --stack-config server:ci-tests --provider ollama --run-vision-tests
 
   # Record mode for updating test recordings
-  $0 --stack-config server:ollama --provider ollama --inference-mode record
+  $0 --stack-config server:ci-tests --provider ollama --inference-mode record
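+
+  # Subset of tests matching a pytest -k pattern (pattern value is illustrative)
+  $0 --stack-config server:ci-tests --provider ollama --test-subdirs 'inference' --test-pattern 'streaming'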
 EOF
 }
@@ -58,8 +60,8 @@ while [[ $# -gt 0 ]]; do
       PROVIDER="$2"
       shift 2
       ;;
-    --test-types)
-      TEST_TYPES="$2"
+    --test-subdirs)
+      TEST_SUBDIRS="$2"
       shift 2
       ;;
     --run-vision-tests)
@@ -70,6 +72,10 @@ while [[ $# -gt 0 ]]; do
       INFERENCE_MODE="$2"
       shift 2
       ;;
+    --test-pattern)
+      TEST_PATTERN="$2"
+      shift 2
+      ;;
     --help)
       usage
       exit 0
@@ -99,9 +105,10 @@ fi
 echo "=== Llama Stack Integration Test Runner ==="
 echo "Stack Config: $STACK_CONFIG"
 echo "Provider: $PROVIDER"
-echo "Test Types: $TEST_TYPES"
+echo "Test Subdirs: $TEST_SUBDIRS"
 echo "Vision Tests: $RUN_VISION_TESTS"
 echo "Inference Mode: $INFERENCE_MODE"
+echo "Test Pattern: $TEST_PATTERN"
 echo ""
 
 # Check storage and memory before tests
@@ -164,16 +171,21 @@ if [[ "$PROVIDER" == "vllm" ]]; then
   EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
 fi
 
+PYTEST_PATTERN="not( $EXCLUDE_TESTS )"
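+# Any user-supplied -k pattern is ANDed onto the exclusions,
+# yielding e.g. "not( ... ) and streaming" (pattern value illustrative)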
+if [[ -n "$TEST_PATTERN" ]]; then
+  PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN"
+fi
+
 # Run vision tests if specified
 if [[ "$RUN_VISION_TESTS" == "true" ]]; then
   echo "Running vision tests..."
   if uv run pytest -s -v tests/integration/inference/test_vision_inference.py \
     --stack-config="$STACK_CONFIG" \
-    -k "not( $EXCLUDE_TESTS )" \
+    -k "$PYTEST_PATTERN" \
     --vision-model=ollama/llama3.2-vision:11b \
     --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
     --color=yes $EXTRA_PARAMS \
-    --capture=tee-sys | tee pytest-${INFERENCE_MODE}-vision.log; then
+    --capture=tee-sys; then
     echo "✅ Vision tests completed successfully"
   else
     echo "❌ Vision tests failed"
@@ -183,28 +195,34 @@ if [[ "$RUN_VISION_TESTS" == "true" ]]; then
 fi
 
 # Run regular tests
-echo "Test types to run: $TEST_TYPES"
+if [[ -z "$TEST_SUBDIRS" ]]; then
+  TEST_SUBDIRS=$(find tests/integration -maxdepth 1 -mindepth 1 -type d |
+    sed 's|tests/integration/||' |
+    grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
+    sort)
+fi
+echo "Test subdirs to run: $TEST_SUBDIRS"
 
 # Collect all test files for the specified test types
 TEST_FILES=""
-for test_type in $(echo "$TEST_TYPES" | jq -r '.[]'); do
+for test_subdir in $(echo "$TEST_SUBDIRS" | tr ',' '\n'); do
   # Skip certain test types for vllm provider
   if [[ "$PROVIDER" == "vllm" ]]; then
-    if [[ "$test_type" == "safety" ]] || [[ "$test_type" == "post_training" ]] || [[ "$test_type" == "tool_runtime" ]]; then
-      echo "Skipping $test_type for vllm provider"
+    if [[ "$test_subdir" == "safety" ]] || [[ "$test_subdir" == "post_training" ]] || [[ "$test_subdir" == "tool_runtime" ]]; then
+      echo "Skipping $test_subdir for vllm provider"
       continue
     fi
   fi
 
-  if [[ -d "tests/integration/$test_type" ]]; then
+  if [[ -d "tests/integration/$test_subdir" ]]; then
     # Find all Python test files in this directory
-    test_files=$(find tests/integration/$test_type -name "test_*.py" -o -name "*_test.py")
+    test_files=$(find tests/integration/$test_subdir -name "test_*.py" -o -name "*_test.py")
     if [[ -n "$test_files" ]]; then
       TEST_FILES="$TEST_FILES $test_files"
-      echo "Added test files from $test_type: $(echo $test_files | wc -w) files"
+      echo "Added test files from $test_subdir: $(echo $test_files | wc -w) files"
     fi
   else
-    echo "Warning: Directory tests/integration/$test_type does not exist"
+    echo "Warning: Directory tests/integration/$test_subdir does not exist"
   fi
 done
@@ -217,14 +235,21 @@ echo ""
 echo "=== Running all collected tests in a single pytest command ==="
 echo "Total test files: $(echo $TEST_FILES | wc -w)"
 
-if uv run pytest -s -v $TEST_FILES \
+set +e
+uv run pytest -s -v $TEST_FILES \
   --stack-config="$STACK_CONFIG" \
-  -k "not( $EXCLUDE_TESTS )" \
+  -k "$PYTEST_PATTERN" \
   --text-model="$TEXT_MODEL" \
   --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
   --color=yes $EXTRA_PARAMS \
-  --capture=tee-sys | tee pytest-${INFERENCE_MODE}-all.log; then
+  --capture=tee-sys
+exit_code=$?
+set -e
+
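+# pytest exits 5 when no tests were collected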
+if [ $exit_code -eq 0 ]; then
   echo "✅ All tests completed successfully"
+elif [ $exit_code -eq 5 ]; then
+  echo "⚠️ No tests collected (pattern matched no tests)"
 else
   echo "❌ Tests failed"
   exit 1