diff --git a/.github/actions/run-and-record-tests/action.yml b/.github/actions/run-and-record-tests/action.yml index 573148e46..1406c6077 100644 --- a/.github/actions/run-and-record-tests/action.yml +++ b/.github/actions/run-and-record-tests/action.yml @@ -2,9 +2,13 @@ name: 'Run and Record Tests' description: 'Run integration tests and handle recording/artifact upload' inputs: - test-types: - description: 'JSON array of test types to run' + test-subdirs: + description: 'Comma-separated list of test subdirectories to run' required: true + test-pattern: + description: 'Regex pattern to pass to pytest -k' + required: false + default: '' stack-config: description: 'Stack configuration to use' required: true @@ -35,9 +39,11 @@ runs: ./scripts/integration-tests.sh \ --stack-config '${{ inputs.stack-config }}' \ --provider '${{ inputs.provider }}' \ - --test-types '${{ inputs.test-types }}' \ + --test-subdirs '${{ inputs.test-subdirs }}' \ + --test-pattern '${{ inputs.test-pattern }}' \ --inference-mode '${{ inputs.inference-mode }}' \ - ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} + ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \ + | tee pytest-${{ inputs.inference-mode }}.log - name: Commit and push recordings @@ -57,10 +63,10 @@ runs: git commit -m "Recordings update from CI" fi - git fetch origin ${{ github.event.pull_request.head.ref }} - git rebase origin/${{ github.event.pull_request.head.ref }} + git fetch origin ${{ github.ref_name }} + git rebase origin/${{ github.ref_name }} echo "Rebased successfully" - git push origin HEAD:${{ github.event.pull_request.head.ref }} + git push origin HEAD:${{ github.ref_name }} echo "Pushed successfully" else echo "No recording changes" diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 9ef49fba3..fc56f62ea 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -31,6 +31,14 @@ on: description: 'Test against a specific provider' type: string default: 'ollama' + test-subdirs: + description: 'Comma-separated list of test subdirectories to run' + type: string + default: '' + test-pattern: + description: 'Regex pattern to pass to pytest -k' + type: string + default: '' concurrency: # Skip concurrency for pushes to main - each commit should be tested independently @@ -38,28 +46,8 @@ concurrency: cancel-in-progress: true jobs: - discover-tests: - runs-on: ubuntu-latest - outputs: - test-types: ${{ steps.generate-test-types.outputs.test-types }} - - steps: - - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: Generate test types - id: generate-test-types - run: | - # Get test directories dynamically, excluding non-test directories - # NOTE: we are excluding post_training since the tests take too long - TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d | - sed 's|tests/integration/||' | - grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" | - sort | jq -R -s -c 'split("\n")[:-1]') - echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT run-replay-mode-tests: - needs: discover-tests runs-on: ubuntu-latest name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }} @@ -90,7 +78,8 @@ jobs: - name: Run tests uses: ./.github/actions/run-and-record-tests with: - test-types: ${{ needs.discover-tests.outputs.test-types }} + test-subdirs: ${{ inputs.test-subdirs }} + test-pattern: ${{ inputs.test-pattern }} stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }} provider: ${{ matrix.provider }} inference-mode: 'replay' diff --git a/.github/workflows/record-integration-tests.yml b/.github/workflows/record-integration-tests.yml index b31709a4f..95403291c 100644 --- a/.github/workflows/record-integration-tests.yml +++ b/.github/workflows/record-integration-tests.yml @@ -1,93 +1,43 @@ +# This workflow should be run manually when needing to re-record tests. This happens when you have +# - added a new test +# - or changed an existing test such that a new inference call is made +# You should make a PR and then run this workflow on that PR branch. The workflow will re-record the +# tests and commit the recordings to the PR branch. name: Integration Tests (Record) run-name: Run the integration test suite from tests/integration on: - pull_request_target: - branches: [ main ] - types: [opened, synchronize, labeled] - paths: - - 'llama_stack/**' - - 'tests/**' - - 'uv.lock' - - 'pyproject.toml' - - '.github/workflows/record-integration-tests.yml' # This workflow - - '.github/actions/setup-ollama/action.yml' - - '.github/actions/setup-test-environment/action.yml' - - '.github/actions/run-and-record-tests/action.yml' workflow_dispatch: inputs: + test-subdirs: + description: 'Comma-separated list of test subdirectories to run' + type: string + default: '' test-provider: description: 'Test against a specific provider' type: string default: 'ollama' - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number }} - cancel-in-progress: true + run-vision-tests: + description: 'Whether to run vision tests' + type: boolean + default: false + test-pattern: + description: 'Regex pattern to pass to pytest -k' + type: string + default: '' jobs: - discover-tests: - if: contains(github.event.pull_request.labels.*.name, 're-record-tests') || - contains(github.event.pull_request.labels.*.name, 're-record-vision-tests') - runs-on: ubuntu-latest - outputs: - test-types: ${{ steps.generate-test-types.outputs.test-types }} - matrix-modes: ${{ steps.generate-test-types.outputs.matrix-modes }} - - steps: - - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: Generate test types - id: generate-test-types - run: | - # Get test directories dynamically, excluding non-test directories - TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" | - grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" | - sort | jq -R -s -c 'split("\n")[:-1]') - echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT - - labels=$(gh pr view ${{ github.event.pull_request.number }} --json labels --jq '.labels[].name') - echo "labels=$labels" - - modes_array=() - if [[ $labels == *"re-record-vision-tests"* ]]; then - modes_array+=("vision") - fi - if [[ $labels == *"re-record-tests"* ]]; then - modes_array+=("non-vision") - fi - - # Convert to JSON array - if [ ${#modes_array[@]} -eq 0 ]; then - matrix_modes="[]" - else - matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]') - fi - echo "matrix_modes=$matrix_modes" - echo "matrix-modes=$matrix_modes" >> $GITHUB_OUTPUT - - env: - GH_TOKEN: ${{ github.token }} - record-tests: - needs: discover-tests runs-on: ubuntu-latest permissions: contents: write - strategy: - fail-fast: false - matrix: - mode: ${{ fromJSON(needs.discover-tests.outputs.matrix-modes) }} - steps: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - ref: ${{ github.event.pull_request.head.ref }} fetch-depth: 0 - name: Setup test environment @@ -96,14 +46,15 @@ jobs: python-version: "3.12" # Use single Python version for recording client-version: "latest" provider: ${{ inputs.test-provider || 'ollama' }} - run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }} + run-vision-tests: ${{ inputs.run-vision-tests }} inference-mode: 'record' - name: Run and record tests uses: ./.github/actions/run-and-record-tests with: - test-types: ${{ needs.discover-tests.outputs.test-types }} + test-pattern: ${{ inputs.test-pattern }} + test-subdirs: ${{ inputs.test-subdirs }} stack-config: 'server:ci-tests' # recording must be done with server since more tests are run provider: ${{ inputs.test-provider || 'ollama' }} inference-mode: 'record' - run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }} + run-vision-tests: ${{ inputs.run-vision-tests }} diff --git a/scripts/integration-tests.sh b/scripts/integration-tests.sh index 8dbbcae90..95b78e271 100755 --- a/scripts/integration-tests.sh +++ b/scripts/integration-tests.sh @@ -14,7 +14,8 @@ set -euo pipefail # Default values STACK_CONFIG="" PROVIDER="" -TEST_TYPES='["inference"]' +TEST_SUBDIRS="" +TEST_PATTERN="" RUN_VISION_TESTS="false" INFERENCE_MODE="replay" EXTRA_PARAMS="" @@ -27,23 +28,24 @@ Usage: $0 [OPTIONS] Options: --stack-config STRING Stack configuration to use (required) --provider STRING Provider to use (ollama, vllm, etc.) (required) - --test-types JSON JSON array of test types to run (default: '["inference"]') + --test-subdirs STRING Comma-separated list of test subdirectories to run (default: 'inference') --run-vision-tests Run vision tests instead of regular tests --inference-mode STRING Inference mode: record or replay (default: replay) + --test-pattern STRING Regex pattern to pass to pytest -k --help Show this help message Examples: # Basic inference tests with ollama - $0 --stack-config server:ollama --provider ollama + $0 --stack-config server:ci-tests --provider ollama - # Multiple test types with vllm - $0 --stack-config server:vllm --provider vllm --test-types '["inference", "agents"]' + # Multiple test directories with vllm + $0 --stack-config server:ci-tests --provider vllm --test-subdirs 'inference,agents' # Vision tests with ollama - $0 --stack-config server:ollama --provider ollama --run-vision-tests + $0 --stack-config server:ci-tests --provider ollama --run-vision-tests # Record mode for updating test recordings - $0 --stack-config server:ollama --provider ollama --inference-mode record + $0 --stack-config server:ci-tests --provider ollama --inference-mode record EOF } @@ -58,8 +60,8 @@ while [[ $# -gt 0 ]]; do PROVIDER="$2" shift 2 ;; - --test-types) - TEST_TYPES="$2" + --test-subdirs) + TEST_SUBDIRS="$2" shift 2 ;; --run-vision-tests) @@ -70,6 +72,10 @@ while [[ $# -gt 0 ]]; do INFERENCE_MODE="$2" shift 2 ;; + --test-pattern) + TEST_PATTERN="$2" + shift 2 + ;; --help) usage exit 0 @@ -99,9 +105,10 @@ fi echo "=== Llama Stack Integration Test Runner ===" echo "Stack Config: $STACK_CONFIG" echo "Provider: $PROVIDER" -echo "Test Types: $TEST_TYPES" +echo "Test Subdirs: $TEST_SUBDIRS" echo "Vision Tests: $RUN_VISION_TESTS" echo "Inference Mode: $INFERENCE_MODE" +echo "Test Pattern: $TEST_PATTERN" echo "" # Check storage and memory before tests @@ -164,17 +171,29 @@ if [[ "$PROVIDER" == "vllm" ]]; then EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls" fi +PYTEST_PATTERN="not( $EXCLUDE_TESTS )" +if [[ -n "$TEST_PATTERN" ]]; then + PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN" +fi + # Run vision tests if specified if [[ "$RUN_VISION_TESTS" == "true" ]]; then echo "Running vision tests..." - if uv run pytest -s -v tests/integration/inference/test_vision_inference.py \ + set +e + uv run pytest -s -v tests/integration/inference/test_vision_inference.py \ --stack-config="$STACK_CONFIG" \ - -k "not( $EXCLUDE_TESTS )" \ + -k "$PYTEST_PATTERN" \ --vision-model=ollama/llama3.2-vision:11b \ --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \ --color=yes $EXTRA_PARAMS \ - --capture=tee-sys | tee pytest-${INFERENCE_MODE}-vision.log; then + --capture=tee-sys + exit_code=$? + set -e + + if [ $exit_code -eq 0 ]; then echo "✅ Vision tests completed successfully" + elif [ $exit_code -eq 5 ]; then + echo "⚠️ No vision tests collected (pattern matched no tests)" else echo "❌ Vision tests failed" exit 1 @@ -183,28 +202,34 @@ if [[ "$RUN_VISION_TESTS" == "true" ]]; then fi # Run regular tests -echo "Test types to run: $TEST_TYPES" +if [[ -z "$TEST_SUBDIRS" ]]; then + TEST_SUBDIRS=$(find tests/integration -maxdepth 1 -mindepth 1 -type d | + sed 's|tests/integration/||' | + grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" | + sort) +fi +echo "Test subdirs to run: $TEST_SUBDIRS" # Collect all test files for the specified test types TEST_FILES="" -for test_type in $(echo "$TEST_TYPES" | jq -r '.[]'); do +for test_subdir in $(echo "$TEST_SUBDIRS" | tr ',' '\n'); do # Skip certain test types for vllm provider if [[ "$PROVIDER" == "vllm" ]]; then - if [[ "$test_type" == "safety" ]] || [[ "$test_type" == "post_training" ]] || [[ "$test_type" == "tool_runtime" ]]; then - echo "Skipping $test_type for vllm provider" + if [[ "$test_subdir" == "safety" ]] || [[ "$test_subdir" == "post_training" ]] || [[ "$test_subdir" == "tool_runtime" ]]; then + echo "Skipping $test_subdir for vllm provider" continue fi fi - if [[ -d "tests/integration/$test_type" ]]; then + if [[ -d "tests/integration/$test_subdir" ]]; then # Find all Python test files in this directory - test_files=$(find tests/integration/$test_type -name "test_*.py" -o -name "*_test.py") + test_files=$(find tests/integration/$test_subdir -name "test_*.py" -o -name "*_test.py") if [[ -n "$test_files" ]]; then TEST_FILES="$TEST_FILES $test_files" - echo "Added test files from $test_type: $(echo $test_files | wc -w) files" + echo "Added test files from $test_subdir: $(echo $test_files | wc -w) files" fi else - echo "Warning: Directory tests/integration/$test_type does not exist" + echo "Warning: Directory tests/integration/$test_subdir does not exist" fi done @@ -217,14 +242,21 @@ echo "" echo "=== Running all collected tests in a single pytest command ===" echo "Total test files: $(echo $TEST_FILES | wc -w)" -if uv run pytest -s -v $TEST_FILES \ +set +e +uv run pytest -s -v $TEST_FILES \ --stack-config="$STACK_CONFIG" \ - -k "not( $EXCLUDE_TESTS )" \ + -k "$PYTEST_PATTERN" \ --text-model="$TEXT_MODEL" \ --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \ --color=yes $EXTRA_PARAMS \ - --capture=tee-sys | tee pytest-${INFERENCE_MODE}-all.log; then + --capture=tee-sys +exit_code=$? +set -e + +if [ $exit_code -eq 0 ]; then echo "✅ All tests completed successfully" +elif [ $exit_code -eq 5 ]; then + echo "⚠️ No tests collected (pattern matched no tests)" else echo "❌ Tests failed" exit 1 diff --git a/tests/integration/recordings/index.sqlite b/tests/integration/recordings/index.sqlite index 1951ee7d6..e01c8803a 100644 Binary files a/tests/integration/recordings/index.sqlite and b/tests/integration/recordings/index.sqlite differ diff --git a/tests/integration/recordings/responses/4a3a4447b16b.json b/tests/integration/recordings/responses/4a3a4447b16b.json index 96b40a792..ee1ee6d70 100644 --- a/tests/integration/recordings/responses/4a3a4447b16b.json +++ b/tests/integration/recordings/responses/4a3a4447b16b.json @@ -14,7 +14,7 @@ "models": [ { "model": "nomic-embed-text:latest", - "modified_at": "2025-08-14T20:26:10.795125-07:00", + "modified_at": "2025-08-15T20:24:13.254634Z", "digest": "0a109f422b47e3a30ba2b10eca18548e944e8a23073ee3f3e947efcf3c45e59f", "size": 274302450, "details": { @@ -28,41 +28,9 @@ "quantization_level": "F16" } }, - { - "model": "llama3.2-vision:11b", - "modified_at": "2025-07-30T18:45:02.517873-07:00", - "digest": "6f2f9757ae97e8a3f8ea33d6adb2b11d93d9a35bef277cd2c0b1b5af8e8d0b1e", - "size": 7816589186, - "details": { - "parent_model": "", - "format": "gguf", - "family": "mllama", - "families": [ - "mllama" - ], - "parameter_size": "10.7B", - "quantization_level": "Q4_K_M" - } - }, - { - "model": "llama3.2-vision:latest", - "modified_at": "2025-07-29T20:18:47.920468-07:00", - "digest": "6f2f9757ae97e8a3f8ea33d6adb2b11d93d9a35bef277cd2c0b1b5af8e8d0b1e", - "size": 7816589186, - "details": { - "parent_model": "", - "format": "gguf", - "family": "mllama", - "families": [ - "mllama" - ], - "parameter_size": "10.7B", - "quantization_level": "Q4_K_M" - } - }, { "model": "llama-guard3:1b", - "modified_at": "2025-07-25T14:39:44.978630-07:00", + "modified_at": "2025-07-31T04:44:58Z", "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b", "size": 1600181919, "details": { @@ -78,7 +46,7 @@ }, { "model": "all-minilm:l6-v2", - "modified_at": "2025-07-24T15:15:11.129290-07:00", + "modified_at": "2025-07-31T04:42:15Z", "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef", "size": 45960996, "details": { @@ -92,57 +60,9 @@ "quantization_level": "F16" } }, - { - "model": "llama3.2:1b", - "modified_at": "2025-07-17T22:02:24.953208-07:00", - "digest": "baf6a787fdffd633537aa2eb51cfd54cb93ff08e28040095462bb63daf552878", - "size": 1321098329, - "details": { - "parent_model": "", - "format": "gguf", - "family": "llama", - "families": [ - "llama" - ], - "parameter_size": "1.2B", - "quantization_level": "Q8_0" - } - }, - { - "model": "all-minilm:latest", - "modified_at": "2025-06-03T16:50:10.946583-07:00", - "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef", - "size": 45960996, - "details": { - "parent_model": "", - "format": "gguf", - "family": "bert", - "families": [ - "bert" - ], - "parameter_size": "23M", - "quantization_level": "F16" - } - }, - { - "model": "llama3.2:3b", - "modified_at": "2025-05-01T11:15:23.797447-07:00", - "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72", - "size": 2019393189, - "details": { - "parent_model": "", - "format": "gguf", - "family": "llama", - "families": [ - "llama" - ], - "parameter_size": "3.2B", - "quantization_level": "Q4_K_M" - } - }, { "model": "llama3.2:3b-instruct-fp16", - "modified_at": "2025-04-30T15:33:48.939665-07:00", + "modified_at": "2025-07-31T04:42:05Z", "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d", "size": 6433703586, "details": { diff --git a/tests/integration/recordings/responses/561746e1c8de.json b/tests/integration/recordings/responses/561746e1c8de.json index a28366693..120f40661 100644 --- a/tests/integration/recordings/responses/561746e1c8de.json +++ b/tests/integration/recordings/responses/561746e1c8de.json @@ -21,7 +21,7 @@ "__type__": "ollama._types.GenerateResponse", "__data__": { "model": "llama3.2:3b-instruct-fp16", - "created_at": "2025-08-04T22:55:14.141947Z", + "created_at": "2025-08-15T20:24:49.18651486Z", "done": false, "done_reason": null, "total_duration": null, @@ -39,7 +39,7 @@ "__type__": "ollama._types.GenerateResponse", "__data__": { "model": "llama3.2:3b-instruct-fp16", - "created_at": "2025-08-04T22:55:14.194979Z", + "created_at": "2025-08-15T20:24:49.370611348Z", "done": false, "done_reason": null, "total_duration": null, @@ -57,7 +57,7 @@ "__type__": "ollama._types.GenerateResponse", "__data__": { "model": "llama3.2:3b-instruct-fp16", - "created_at": "2025-08-04T22:55:14.248312Z", + "created_at": "2025-08-15T20:24:49.557000029Z", "done": false, "done_reason": null, "total_duration": null, @@ -75,7 +75,7 @@ "__type__": "ollama._types.GenerateResponse", "__data__": { "model": "llama3.2:3b-instruct-fp16", - "created_at": "2025-08-04T22:55:14.301911Z", + "created_at": "2025-08-15T20:24:49.746777116Z", "done": false, "done_reason": null, "total_duration": null, @@ -93,7 +93,7 @@ "__type__": "ollama._types.GenerateResponse", "__data__": { "model": "llama3.2:3b-instruct-fp16", - "created_at": "2025-08-04T22:55:14.354437Z", + "created_at": "2025-08-15T20:24:49.942233333Z", "done": false, "done_reason": null, "total_duration": null, @@ -111,7 +111,7 @@ "__type__": "ollama._types.GenerateResponse", "__data__": { "model": "llama3.2:3b-instruct-fp16", - "created_at": "2025-08-04T22:55:14.406821Z", + "created_at": "2025-08-15T20:24:50.126788846Z", "done": false, "done_reason": null, "total_duration": null, @@ -129,7 +129,7 @@ "__type__": "ollama._types.GenerateResponse", "__data__": { "model": "llama3.2:3b-instruct-fp16", - "created_at": "2025-08-04T22:55:14.457633Z", + "created_at": "2025-08-15T20:24:50.311346131Z", "done": false, "done_reason": null, "total_duration": null, @@ -147,7 +147,7 @@ "__type__": "ollama._types.GenerateResponse", "__data__": { "model": "llama3.2:3b-instruct-fp16", - "created_at": "2025-08-04T22:55:14.507857Z", + "created_at": "2025-08-15T20:24:50.501507173Z", "done": false, "done_reason": null, "total_duration": null, @@ -165,7 +165,7 @@ "__type__": "ollama._types.GenerateResponse", "__data__": { "model": "llama3.2:3b-instruct-fp16", - "created_at": "2025-08-04T22:55:14.558847Z", + "created_at": "2025-08-15T20:24:50.692296777Z", "done": false, "done_reason": null, "total_duration": null, @@ -183,7 +183,7 @@ "__type__": "ollama._types.GenerateResponse", "__data__": { "model": "llama3.2:3b-instruct-fp16", - "created_at": "2025-08-04T22:55:14.609969Z", + "created_at": "2025-08-15T20:24:50.878846539Z", "done": false, "done_reason": null, "total_duration": null, @@ -201,15 +201,15 @@ "__type__": "ollama._types.GenerateResponse", "__data__": { "model": "llama3.2:3b-instruct-fp16", - "created_at": "2025-08-04T22:55:14.660997Z", + "created_at": "2025-08-15T20:24:51.063200561Z", "done": true, "done_reason": "stop", - "total_duration": 715356542, - "load_duration": 59747500, + "total_duration": 33982453650, + "load_duration": 2909001805, "prompt_eval_count": 341, - "prompt_eval_duration": 128000000, + "prompt_eval_duration": 29194357307, "eval_count": 11, - "eval_duration": 526000000, + "eval_duration": 1878247732, "response": "", "thinking": null, "context": null