feat(tests): migrate to global "setups" system for test configuration

This PR refactors the integration test system from suite-specific "contexts" to global "setups".
This provides better separation of concerns: **suites = what to test, setups = how to configure.**

Key changes:
- New `tests/integration/setups.py` with global, reusable configurations (ollama, vllm, gpt, claude)
- Simplified `tests/integration/suites.py` to only define test collection roots + default setup references
- Updated CLI from `--context` to `--setup` parameter that works with any suite
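- Modified `scripts/integration-tests.sh` to accept `--setup` (replacing `--provider`) and the shortened `--suite`, `--subdirs`, and `--pattern` flags (previously `--test-suite`, `--test-subdirs`, `--test-pattern`)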
- Updated documentation to reflect the new global setup system

Benefits:
- Setups can be reused across multiple suites (e.g., use "gpt" with any suite)
- Clear separation between test selection (suites) and configuration (setups)
- Easier to add new configurations without modifying existing suites
- Centralized configuration management

Usage examples:
- `pytest tests/integration --suite=responses --setup=gpt`
- `pytest tests/integration --suite=vision --setup=ollama`
- `pytest tests/integration --suite=base --setup=vllm`
This commit is contained in:
Ashwin Bharambe 2025-09-08 14:56:08 -07:00
parent 47b640370e
commit c662d8aa31
10 changed files with 272 additions and 178 deletions

View file

@ -5,21 +5,22 @@ inputs:
stack-config: stack-config:
description: 'Stack configuration to use' description: 'Stack configuration to use'
required: true required: true
provider: setup:
description: 'Provider to use for tests' description: 'Setup to use for tests (e.g., ollama, gpt, vllm)'
required: true required: false
default: ''
inference-mode: inference-mode:
description: 'Inference mode (record or replay)' description: 'Inference mode (record or replay)'
required: true required: true
test-suite: suite:
description: 'Test suite to use: base, responses, vision, etc.' description: 'Test suite to use: base, responses, vision, etc.'
required: false required: false
default: '' default: ''
test-subdirs: subdirs:
description: 'Comma-separated list of test subdirectories to run; overrides test-suite' description: 'Comma-separated list of test subdirectories to run; overrides suite'
required: false required: false
default: '' default: ''
test-pattern: pattern:
description: 'Regex pattern to pass to pytest -k' description: 'Regex pattern to pass to pytest -k'
required: false required: false
default: '' default: ''
@ -37,14 +38,23 @@ runs:
- name: Run Integration Tests - name: Run Integration Tests
shell: bash shell: bash
run: | run: |
uv run --no-sync ./scripts/integration-tests.sh \ SCRIPT_ARGS="--stack-config '${{ inputs.stack-config }}' --inference-mode '${{ inputs.inference-mode }}'"
--stack-config '${{ inputs.stack-config }}' \
--provider '${{ inputs.provider }}' \ # Add optional arguments only if they are provided
--test-subdirs '${{ inputs.test-subdirs }}' \ if [ -n '${{ inputs.setup }}' ]; then
--test-pattern '${{ inputs.test-pattern }}' \ SCRIPT_ARGS="$SCRIPT_ARGS --setup '${{ inputs.setup }}'"
--inference-mode '${{ inputs.inference-mode }}' \ fi
--test-suite '${{ inputs.test-suite }}' \ if [ -n '${{ inputs.suite }}' ]; then
| tee pytest-${{ inputs.inference-mode }}.log SCRIPT_ARGS="$SCRIPT_ARGS --suite '${{ inputs.suite }}'"
fi
if [ -n '${{ inputs.subdirs }}' ]; then
SCRIPT_ARGS="$SCRIPT_ARGS --subdirs '${{ inputs.subdirs }}'"
fi
if [ -n '${{ inputs.pattern }}' ]; then
SCRIPT_ARGS="$SCRIPT_ARGS --pattern '${{ inputs.pattern }}'"
fi
uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS | tee pytest-${{ inputs.inference-mode }}.log
- name: Commit and push recordings - name: Commit and push recordings
@ -58,7 +68,7 @@ runs:
echo "New recordings detected, committing and pushing" echo "New recordings detected, committing and pushing"
git add tests/integration/recordings/ git add tests/integration/recordings/
git commit -m "Recordings update from CI (test-suite: ${{ inputs.test-suite }})" git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})"
git fetch origin ${{ github.ref_name }} git fetch origin ${{ github.ref_name }}
git rebase origin/${{ github.ref_name }} git rebase origin/${{ github.ref_name }}
echo "Rebased successfully" echo "Rebased successfully"

View file

@ -1,7 +1,7 @@
name: Setup Ollama name: Setup Ollama
description: Start Ollama description: Start Ollama
inputs: inputs:
test-suite: suite:
description: 'Test suite to use: base, responses, vision, etc.' description: 'Test suite to use: base, responses, vision, etc.'
required: false required: false
default: '' default: ''
@ -11,7 +11,7 @@ runs:
- name: Start Ollama - name: Start Ollama
shell: bash shell: bash
run: | run: |
if [ "${{ inputs.test-suite }}" == "vision" ]; then if [ "${{ inputs.suite }}" == "vision" ]; then
image="ollama-with-vision-model" image="ollama-with-vision-model"
else else
image="ollama-with-models" image="ollama-with-models"

View file

@ -8,11 +8,11 @@ inputs:
client-version: client-version:
description: 'Client version (latest or published)' description: 'Client version (latest or published)'
required: true required: true
provider: setup:
description: 'Provider to setup (ollama or vllm)' description: 'Setup to configure (ollama, vllm, gpt, etc.)'
required: true required: false
default: 'ollama' default: 'ollama'
test-suite: suite:
description: 'Test suite to use: base, responses, vision, etc.' description: 'Test suite to use: base, responses, vision, etc.'
required: false required: false
default: '' default: ''
@ -30,13 +30,13 @@ runs:
client-version: ${{ inputs.client-version }} client-version: ${{ inputs.client-version }}
- name: Setup ollama - name: Setup ollama
if: ${{ inputs.provider == 'ollama' && inputs.inference-mode == 'record' }} if: ${{ (inputs.setup == 'ollama' || inputs.setup == 'ollama-vision') && inputs.inference-mode == 'record' }}
uses: ./.github/actions/setup-ollama uses: ./.github/actions/setup-ollama
with: with:
test-suite: ${{ inputs.test-suite }} suite: ${{ inputs.suite }}
- name: Setup vllm - name: Setup vllm
if: ${{ inputs.provider == 'vllm' && inputs.inference-mode == 'record' }} if: ${{ inputs.setup == 'vllm' && inputs.inference-mode == 'record' }}
uses: ./.github/actions/setup-vllm uses: ./.github/actions/setup-vllm
- name: Build Llama Stack - name: Build Llama Stack

View file

@ -28,8 +28,8 @@ on:
description: 'Test against both the latest and published versions' description: 'Test against both the latest and published versions'
type: boolean type: boolean
default: false default: false
test-provider: test-setup:
description: 'Test against a specific provider' description: 'Test against a specific setup'
type: string type: string
default: 'ollama' default: 'ollama'
@ -42,18 +42,18 @@ jobs:
run-replay-mode-tests: run-replay-mode-tests:
runs-on: ubuntu-latest runs-on: ubuntu-latest
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.test-suite) }} name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.setup, matrix.python-version, matrix.client-version, matrix.suite) }}
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
client-type: [library, server] client-type: [library, server]
# Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama) # Use vllm on weekly schedule, otherwise use test-setup input (defaults to ollama)
provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }} setup: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-setup || 'ollama')) }}
# Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12 # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }} python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }} client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
test-suite: [base, vision] suite: [base, vision]
steps: steps:
- name: Checkout repository - name: Checkout repository
@ -64,14 +64,14 @@ jobs:
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
client-version: ${{ matrix.client-version }} client-version: ${{ matrix.client-version }}
provider: ${{ matrix.provider }} setup: ${{ matrix.setup }}
test-suite: ${{ matrix.test-suite }} suite: ${{ matrix.suite }}
inference-mode: 'replay' inference-mode: 'replay'
- name: Run tests - name: Run tests
uses: ./.github/actions/run-and-record-tests uses: ./.github/actions/run-and-record-tests
with: with:
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }} stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
provider: ${{ matrix.provider }} setup: ${{ matrix.setup }}
inference-mode: 'replay' inference-mode: 'replay'
test-suite: ${{ matrix.test-suite }} suite: ${{ matrix.suite }}

View file

@ -10,19 +10,19 @@ run-name: Run the integration test suite from tests/integration
on: on:
workflow_dispatch: workflow_dispatch:
inputs: inputs:
test-provider: test-setup:
description: 'Test against a specific provider' description: 'Test against a specific setup'
type: string type: string
default: 'ollama' default: 'ollama'
test-suite: suite:
description: 'Test suite to use: base, responses, vision, etc.' description: 'Test suite to use: base, responses, vision, etc.'
type: string type: string
default: '' default: ''
test-subdirs: subdirs:
description: 'Comma-separated list of test subdirectories to run; overrides test-suite' description: 'Comma-separated list of test subdirectories to run; overrides suite'
type: string type: string
default: '' default: ''
test-pattern: pattern:
description: 'Regex pattern to pass to pytest -k' description: 'Regex pattern to pass to pytest -k'
type: string type: string
default: '' default: ''
@ -39,10 +39,10 @@ jobs:
run: | run: |
echo "::group::Workflow Inputs" echo "::group::Workflow Inputs"
echo "branch: ${{ github.ref_name }}" echo "branch: ${{ github.ref_name }}"
echo "test-provider: ${{ inputs.test-provider }}" echo "test-setup: ${{ inputs.test-setup }}"
echo "test-suite: ${{ inputs.test-suite }}" echo "suite: ${{ inputs.suite }}"
echo "test-subdirs: ${{ inputs.test-subdirs }}" echo "subdirs: ${{ inputs.subdirs }}"
echo "test-pattern: ${{ inputs.test-pattern }}" echo "pattern: ${{ inputs.pattern }}"
echo "::endgroup::" echo "::endgroup::"
- name: Checkout repository - name: Checkout repository
@ -55,16 +55,16 @@ jobs:
with: with:
python-version: "3.12" # Use single Python version for recording python-version: "3.12" # Use single Python version for recording
client-version: "latest" client-version: "latest"
provider: ${{ inputs.test-provider || 'ollama' }} setup: ${{ inputs.test-setup || 'ollama' }}
test-suite: ${{ inputs.test-suite }} suite: ${{ inputs.suite }}
inference-mode: 'record' inference-mode: 'record'
- name: Run and record tests - name: Run and record tests
uses: ./.github/actions/run-and-record-tests uses: ./.github/actions/run-and-record-tests
with: with:
stack-config: 'server:ci-tests' # recording must be done with server since more tests are run stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
provider: ${{ inputs.test-provider || 'ollama' }} setup: ${{ inputs.test-setup || 'ollama' }}
inference-mode: 'record' inference-mode: 'record'
test-suite: ${{ inputs.test-suite }} suite: ${{ inputs.suite }}
test-subdirs: ${{ inputs.test-subdirs }} subdirs: ${{ inputs.subdirs }}
test-pattern: ${{ inputs.test-pattern }} pattern: ${{ inputs.pattern }}

View file

@ -14,7 +14,7 @@ set -euo pipefail
# Default values # Default values
BRANCH="" BRANCH=""
TEST_SUBDIRS="" TEST_SUBDIRS=""
TEST_PROVIDER="ollama" TEST_SETUP="ollama"
TEST_SUITE="base" TEST_SUITE="base"
TEST_PATTERN="" TEST_PATTERN=""
@ -27,7 +27,7 @@ Trigger the integration test recording workflow remotely. This way you do not ne
OPTIONS: OPTIONS:
-b, --branch BRANCH Branch to run the workflow on (defaults to current branch) -b, --branch BRANCH Branch to run the workflow on (defaults to current branch)
-p, --test-provider PROVIDER Test provider to use: vllm or ollama (default: ollama) -p, --test-setup SETUP Test setup to use: vllm, ollama, gpt, etc. (default: ollama)
-t, --test-suite SUITE Test suite to use: base, responses, vision, etc. (default: base) -t, --test-suite SUITE Test suite to use: base, responses, vision, etc. (default: base)
-s, --test-subdirs DIRS Comma-separated list of test subdirectories to run (overrides suite) -s, --test-subdirs DIRS Comma-separated list of test subdirectories to run (overrides suite)
-k, --test-pattern PATTERN Regex pattern to pass to pytest -k -k, --test-pattern PATTERN Regex pattern to pass to pytest -k
@ -40,8 +40,8 @@ EXAMPLES:
# Record tests for specific branch with vision tests # Record tests for specific branch with vision tests
$0 -b my-feature-branch --test-suite vision $0 -b my-feature-branch --test-suite vision
# Record multiple test subdirectories with specific provider # Record multiple test subdirectories with specific setup
$0 --test-subdirs "agents,inference" --test-provider vllm $0 --test-subdirs "agents,inference" --test-setup vllm
# Record tests matching a specific pattern # Record tests matching a specific pattern
$0 --test-subdirs "inference" --test-pattern "test_streaming" $0 --test-subdirs "inference" --test-pattern "test_streaming"
@ -67,8 +67,8 @@ while [[ $# -gt 0 ]]; do
TEST_SUBDIRS="$2" TEST_SUBDIRS="$2"
shift 2 shift 2
;; ;;
-p|--test-provider) -p|--test-setup)
TEST_PROVIDER="$2" TEST_SETUP="$2"
shift 2 shift 2
;; ;;
-t|--test-suite) -t|--test-suite)
@ -101,13 +101,8 @@ if [[ -z "$TEST_SUBDIRS" && -z "$TEST_SUITE" ]]; then
exit 1 exit 1
fi fi
# Validate test provider # Validate test setup (optional - setups are validated by the workflow itself)
if [[ "$TEST_PROVIDER" != "vllm" && "$TEST_PROVIDER" != "ollama" ]]; then # Common setups: ollama, vllm, gpt, etc.
echo "❌ Error: Invalid test provider '$TEST_PROVIDER'"
echo " Supported providers: vllm, ollama"
echo " Example: $0 --test-subdirs \"agents\" --test-provider vllm"
exit 1
fi
# Check if required tools are installed # Check if required tools are installed
if ! command -v gh &> /dev/null; then if ! command -v gh &> /dev/null; then
@ -237,7 +232,7 @@ fi
# Build the workflow dispatch command # Build the workflow dispatch command
echo "Triggering integration test recording workflow..." echo "Triggering integration test recording workflow..."
echo "Branch: $BRANCH" echo "Branch: $BRANCH"
echo "Test provider: $TEST_PROVIDER" echo "Test setup: $TEST_SETUP"
echo "Test subdirs: $TEST_SUBDIRS" echo "Test subdirs: $TEST_SUBDIRS"
echo "Test suite: $TEST_SUITE" echo "Test suite: $TEST_SUITE"
echo "Test pattern: ${TEST_PATTERN:-"(none)"}" echo "Test pattern: ${TEST_PATTERN:-"(none)"}"
@ -245,16 +240,16 @@ echo ""
# Prepare inputs for gh workflow run # Prepare inputs for gh workflow run
if [[ -n "$TEST_SUBDIRS" ]]; then if [[ -n "$TEST_SUBDIRS" ]]; then
INPUTS="-f test-subdirs='$TEST_SUBDIRS'" INPUTS="-f subdirs='$TEST_SUBDIRS'"
fi fi
if [[ -n "$TEST_PROVIDER" ]]; then if [[ -n "$TEST_SETUP" ]]; then
INPUTS="$INPUTS -f test-provider='$TEST_PROVIDER'" INPUTS="$INPUTS -f test-setup='$TEST_SETUP'"
fi fi
if [[ -n "$TEST_SUITE" ]]; then if [[ -n "$TEST_SUITE" ]]; then
INPUTS="$INPUTS -f test-suite='$TEST_SUITE'" INPUTS="$INPUTS -f suite='$TEST_SUITE'"
fi fi
if [[ -n "$TEST_PATTERN" ]]; then if [[ -n "$TEST_PATTERN" ]]; then
INPUTS="$INPUTS -f test-pattern='$TEST_PATTERN'" INPUTS="$INPUTS -f pattern='$TEST_PATTERN'"
fi fi
# Run the workflow # Run the workflow

View file

@ -13,10 +13,10 @@ set -euo pipefail
# Default values # Default values
STACK_CONFIG="" STACK_CONFIG=""
PROVIDER="" TEST_SUITE="base"
TEST_SETUP=""
TEST_SUBDIRS="" TEST_SUBDIRS=""
TEST_PATTERN="" TEST_PATTERN=""
TEST_SUITE="base"
INFERENCE_MODE="replay" INFERENCE_MODE="replay"
EXTRA_PARAMS="" EXTRA_PARAMS=""
@ -27,29 +27,30 @@ Usage: $0 [OPTIONS]
Options: Options:
--stack-config STRING Stack configuration to use (required) --stack-config STRING Stack configuration to use (required)
--provider STRING Provider to use (ollama, vllm, etc.) (required) --suite STRING Test suite to run (default: 'base')
--test-suite STRING Comma-separated list of test suites to run (default: 'base') --setup STRING Test setup (models, env) to use (e.g., 'ollama', 'ollama-vision', 'gpt', 'vllm')
--inference-mode STRING Inference mode: record or replay (default: replay) --inference-mode STRING Inference mode: record or replay (default: replay)
--test-subdirs STRING Comma-separated list of test subdirectories to run (overrides suite) --subdirs STRING Comma-separated list of test subdirectories to run (overrides suite)
--test-pattern STRING Regex pattern to pass to pytest -k --pattern STRING Regex pattern to pass to pytest -k
--help Show this help message --help Show this help message
Suites are defined in tests/integration/suites.py. They are used to narrow the collection of tests and provide default model options. Suites are defined in tests/integration/suites.py and define which tests to run.
Setups are defined in tests/integration/setups.py and provide global configuration (models, env).
You can also specify subdirectories (of tests/integration) to select tests from, which will override the suite. You can also specify subdirectories (of tests/integration) to select tests from, which will override the suite.
Examples: Examples:
# Basic inference tests with ollama # Basic inference tests with ollama
$0 --stack-config server:ci-tests --provider ollama $0 --stack-config server:ci-tests --suite base --setup ollama
# Multiple test directories with vllm # Multiple test directories with vllm
$0 --stack-config server:ci-tests --provider vllm --test-subdirs 'inference,agents' $0 --stack-config server:ci-tests --subdirs 'inference,agents' --setup vllm
# Vision tests with ollama # Vision tests with ollama
$0 --stack-config server:ci-tests --provider ollama --test-suite vision $0 --stack-config server:ci-tests --suite vision # default setup for this suite is ollama-vision
# Record mode for updating test recordings # Record mode for updating test recordings
$0 --stack-config server:ci-tests --provider ollama --inference-mode record $0 --stack-config server:ci-tests --suite base --inference-mode record
EOF EOF
} }
@ -60,15 +61,15 @@ while [[ $# -gt 0 ]]; do
STACK_CONFIG="$2" STACK_CONFIG="$2"
shift 2 shift 2
;; ;;
--provider) --setup)
PROVIDER="$2" TEST_SETUP="$2"
shift 2 shift 2
;; ;;
--test-subdirs) --subdirs)
TEST_SUBDIRS="$2" TEST_SUBDIRS="$2"
shift 2 shift 2
;; ;;
--test-suite) --suite)
TEST_SUITE="$2" TEST_SUITE="$2"
shift 2 shift 2
;; ;;
@ -76,7 +77,7 @@ while [[ $# -gt 0 ]]; do
INFERENCE_MODE="$2" INFERENCE_MODE="$2"
shift 2 shift 2
;; ;;
--test-pattern) --pattern)
TEST_PATTERN="$2" TEST_PATTERN="$2"
shift 2 shift 2
;; ;;
@ -96,11 +97,13 @@ done
# Validate required parameters # Validate required parameters
if [[ -z "$STACK_CONFIG" ]]; then if [[ -z "$STACK_CONFIG" ]]; then
echo "Error: --stack-config is required" echo "Error: --stack-config is required"
usage
exit 1 exit 1
fi fi
if [[ -z "$PROVIDER" ]]; then if [[ -z "$TEST_SETUP" && -n "$TEST_SUBDIRS" ]]; then
echo "Error: --provider is required" echo "Error: --test-setup is required when --test-subdirs is provided"
usage
exit 1 exit 1
fi fi
@ -111,7 +114,7 @@ fi
echo "=== Llama Stack Integration Test Runner ===" echo "=== Llama Stack Integration Test Runner ==="
echo "Stack Config: $STACK_CONFIG" echo "Stack Config: $STACK_CONFIG"
echo "Provider: $PROVIDER" echo "Setup: $TEST_SETUP"
echo "Inference Mode: $INFERENCE_MODE" echo "Inference Mode: $INFERENCE_MODE"
echo "Test Suite: $TEST_SUITE" echo "Test Suite: $TEST_SUITE"
echo "Test Subdirs: $TEST_SUBDIRS" echo "Test Subdirs: $TEST_SUBDIRS"
@ -129,18 +132,10 @@ echo ""
# Set environment variables # Set environment variables
export LLAMA_STACK_CLIENT_TIMEOUT=300 export LLAMA_STACK_CLIENT_TIMEOUT=300
export LLAMA_STACK_TEST_INFERENCE_MODE="$INFERENCE_MODE"
# Configure provider-specific settings # Setup-specific configuration is now handled by pytest via --setup
if [[ "$PROVIDER" == "ollama" ]]; then if [[ -n "$TEST_SETUP" ]]; then
export OLLAMA_URL="http://0.0.0.0:11434" EXTRA_PARAMS="--setup=$TEST_SETUP"
export TEXT_MODEL="ollama/llama3.2:3b-instruct-fp16"
export SAFETY_MODEL="ollama/llama-guard3:1b"
EXTRA_PARAMS="--safety-shield=llama-guard"
else
export VLLM_URL="http://localhost:8000/v1"
export TEXT_MODEL="vllm/meta-llama/Llama-3.2-1B-Instruct"
EXTRA_PARAMS=""
fi fi
THIS_DIR=$(dirname "$0") THIS_DIR=$(dirname "$0")
@ -191,8 +186,8 @@ fi
echo "=== Running Integration Tests ===" echo "=== Running Integration Tests ==="
EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag" EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"
# Additional exclusions for vllm provider # Additional exclusions for vllm setup
if [[ "$PROVIDER" == "vllm" ]]; then if [[ "$TEST_SETUP" == "vllm" ]]; then
EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls" EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
fi fi
@ -229,21 +224,23 @@ if [[ -n "$TEST_SUBDIRS" ]]; then
echo "Total test files: $(echo $TEST_FILES | wc -w)" echo "Total test files: $(echo $TEST_FILES | wc -w)"
PYTEST_TARGET="$TEST_FILES" PYTEST_TARGET="$TEST_FILES"
EXTRA_PARAMS="$EXTRA_PARAMS --text-model=$TEXT_MODEL --embedding-model=sentence-transformers/all-MiniLM-L6-v2"
else else
PYTEST_TARGET="tests/integration/" PYTEST_TARGET="tests/integration/"
EXTRA_PARAMS="$EXTRA_PARAMS --suite=$TEST_SUITE" EXTRA_PARAMS="$EXTRA_PARAMS --suite=$TEST_SUITE"
fi fi
set +e set +e
set -x
pytest -s -v $PYTEST_TARGET \ pytest -s -v $PYTEST_TARGET \
--stack-config="$STACK_CONFIG" \ --stack-config="$STACK_CONFIG" \
--inference-mode="$INFERENCE_MODE" \
-k "$PYTEST_PATTERN" \ -k "$PYTEST_PATTERN" \
$EXTRA_PARAMS \ $EXTRA_PARAMS \
--color=yes \ --color=yes \
--capture=tee-sys --capture=tee-sys
exit_code=$? exit_code=$?
set -e set -e
set +x
if [ $exit_code -eq 0 ]; then if [ $exit_code -eq 0 ]; then
echo "✅ All tests completed successfully" echo "✅ All tests completed successfully"

View file

@ -6,8 +6,6 @@ Integration tests verify complete workflows across different providers using Lla
```bash ```bash
# Run all integration tests with existing recordings # Run all integration tests with existing recordings
LLAMA_STACK_TEST_INFERENCE_MODE=replay \
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
uv run --group test \ uv run --group test \
pytest -sv tests/integration/ --stack-config=starter pytest -sv tests/integration/ --stack-config=starter
``` ```
@ -42,25 +40,35 @@ Model parameters can be influenced by the following options:
Each of these are comma-separated lists and can be used to generate multiple parameter combinations. Note that tests will be skipped Each of these are comma-separated lists and can be used to generate multiple parameter combinations. Note that tests will be skipped
if no model is specified. if no model is specified.
### Suites (fast selection + sane defaults) ### Suites and Setups
- `--suite`: comma-separated list of named suites that both narrow which tests are collected and prefill common model options (unless you pass them explicitly). - `--suite`: single named suite that narrows which tests are collected.
- Available suites: - Available suites:
- `responses`: collects tests under `tests/integration/responses`; this is a separate suite because it needs a strong tool-calling model. - `base`: collects most tests (excludes responses and post_training)
- `vision`: collects only `tests/integration/inference/test_vision_inference.py`; defaults `--vision-model=ollama/llama3.2-vision:11b`, `--embedding-model=sentence-transformers/all-MiniLM-L6-v2`. - `responses`: collects tests under `tests/integration/responses` (needs strong tool-calling models)
- Explicit flags always win. For example, `--suite=responses --text-model=<X>` overrides the suites text model. - `vision`: collects only `tests/integration/inference/test_vision_inference.py`
- `--setup`: global configuration that can be used with any suite. Setups prefill model/env defaults; explicit CLI flags always win.
- Available setups:
- `ollama`: Local Ollama provider with lightweight models (sets OLLAMA_URL, uses llama3.2:3b-instruct-fp16)
- `vllm`: vLLM provider for efficient local inference (sets VLLM_URL, uses Llama-3.2-1B-Instruct)
- `gpt`: OpenAI GPT models for high-quality responses (uses gpt-4o)
- `claude`: Anthropic Claude models for high-quality responses (uses claude-3-5-sonnet)
Examples: Examples
```bash ```bash
# Fast responses run with defaults # Fast responses run with a strong tool-calling model
pytest -s -v tests/integration --stack-config=server:starter --suite=responses pytest -s -v tests/integration --stack-config=server:starter --suite=responses --setup=gpt
# Fast single-file vision run with defaults # Fast single-file vision run with Ollama defaults
pytest -s -v tests/integration --stack-config=server:starter --suite=vision pytest -s -v tests/integration --stack-config=server:starter --suite=vision --setup=ollama
# Combine suites and override a default # Base suite with VLLM for performance
pytest -s -v tests/integration --stack-config=server:starter --suite=responses,vision --embedding-model=text-embedding-3-small pytest -s -v tests/integration --stack-config=server:starter --suite=base --setup=vllm
# Override a default from setup
pytest -s -v tests/integration --stack-config=server:starter \
--suite=responses --setup=gpt --embedding-model=text-embedding-3-small
``` ```
## Examples ## Examples
@ -127,14 +135,13 @@ pytest tests/integration/
### RECORD Mode ### RECORD Mode
Captures API interactions for later replay: Captures API interactions for later replay:
```bash ```bash
LLAMA_STACK_TEST_INFERENCE_MODE=record \ pytest tests/integration/inference/test_new_feature.py --inference-mode=record
pytest tests/integration/inference/test_new_feature.py
``` ```
### LIVE Mode ### LIVE Mode
Tests make real API calls (but not recorded): Tests make real API calls (but not recorded):
```bash ```bash
LLAMA_STACK_TEST_INFERENCE_MODE=live pytest tests/integration/ pytest tests/integration/ --inference-mode=live
``` ```
By default, the recording directory is `tests/integration/recordings`. You can override this by setting the `LLAMA_STACK_TEST_RECORDING_DIR` environment variable. By default, the recording directory is `tests/integration/recordings`. You can override this by setting the `LLAMA_STACK_TEST_RECORDING_DIR` environment variable.
@ -162,8 +169,7 @@ See the [main testing guide](../README.md#remote-re-recording-recommended) for f
#### Local Re-recording #### Local Re-recording
```bash ```bash
# Re-record specific tests # Re-record specific tests
LLAMA_STACK_TEST_INFERENCE_MODE=record \ pytest -s -v --stack-config=server:starter tests/integration/inference/test_modified.py --inference-mode=record
pytest -s -v --stack-config=server:starter tests/integration/inference/test_modified.py
``` ```
Note that when re-recording tests, you must use a Stack pointing to a server (i.e., `server:starter`). This subtlety exists because the set of tests run in server are a superset of the set of tests run in the library client. Note that when re-recording tests, you must use a Stack pointing to a server (i.e., `server:starter`). This subtlety exists because the set of tests run in server are a superset of the set of tests run in the library client.

View file

@ -15,7 +15,7 @@ from dotenv import load_dotenv
from llama_stack.log import get_logger from llama_stack.log import get_logger
from .suites import SUITE_DEFINITIONS from .suites import SETUP_DEFINITIONS, SUITE_DEFINITIONS
logger = get_logger(__name__, category="tests") logger = get_logger(__name__, category="tests")
@ -63,19 +63,33 @@ def pytest_configure(config):
key, value = env_var.split("=", 1) key, value = env_var.split("=", 1)
os.environ[key] = value os.environ[key] = value
suites_raw = config.getoption("--suite") inference_mode = config.getoption("--inference-mode")
suites: list[str] = [] os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = inference_mode
if suites_raw:
suites = [p.strip() for p in str(suites_raw).split(",") if p.strip()] suite = config.getoption("--suite")
unknown = [p for p in suites if p not in SUITE_DEFINITIONS] if suite:
if unknown: if suite not in SUITE_DEFINITIONS:
raise pytest.UsageError(f"Unknown suite: {suite}. Available: {', '.join(sorted(SUITE_DEFINITIONS.keys()))}")
# Apply setups (global parameterizations): env + defaults
setup = config.getoption("--setup")
if suite and not setup:
setup = SUITE_DEFINITIONS[suite].default_setup
if setup:
if setup not in SETUP_DEFINITIONS:
raise pytest.UsageError( raise pytest.UsageError(
f"Unknown suite(s): {', '.join(unknown)}. Available: {', '.join(sorted(SUITE_DEFINITIONS.keys()))}" f"Unknown setup '{setup}'. Available: {', '.join(sorted(SETUP_DEFINITIONS.keys()))}"
) )
for suite in suites:
suite_def = SUITE_DEFINITIONS.get(suite, {}) setup_obj = SETUP_DEFINITIONS[setup]
defaults: dict = suite_def.get("defaults", {}) logger.info(f"Applying setup '{setup}'{' for suite ' + suite if suite else ''}")
for dest, value in defaults.items(): # Apply env first
for k, v in setup_obj.env.items():
if k not in os.environ:
os.environ[k] = str(v)
# Apply defaults if not provided explicitly
for dest, value in setup_obj.defaults.items():
current = getattr(config.option, dest, None) current = getattr(config.option, dest, None)
if not current: if not current:
setattr(config.option, dest, value) setattr(config.option, dest, value)
@ -120,6 +134,13 @@ def pytest_addoption(parser):
default=384, default=384,
help="Output dimensionality of the embedding model to use for testing. Default: 384", help="Output dimensionality of the embedding model to use for testing. Default: 384",
) )
parser.addoption(
"--inference-mode",
help="Inference mode: { record, replay, live } (default: replay)",
choices=["record", "replay", "live"],
default="replay",
)
parser.addoption( parser.addoption(
"--report", "--report",
help="Path where the test report should be written, e.g. --report=/path/to/report.md", help="Path where the test report should be written, e.g. --report=/path/to/report.md",
@ -127,14 +148,18 @@ def pytest_addoption(parser):
available_suites = ", ".join(sorted(SUITE_DEFINITIONS.keys())) available_suites = ", ".join(sorted(SUITE_DEFINITIONS.keys()))
suite_help = ( suite_help = (
"Comma-separated integration test suites to narrow collection and prefill defaults. " f"Single test suite to run (narrows collection). Available: {available_suites}. Example: --suite=responses"
"Available: "
f"{available_suites}. "
"Explicit CLI flags (e.g., --text-model) override suite defaults. "
"Examples: --suite=responses or --suite=responses,vision."
) )
parser.addoption("--suite", help=suite_help) parser.addoption("--suite", help=suite_help)
# Global setups for any suite
available_setups = ", ".join(sorted(SETUP_DEFINITIONS.keys()))
setup_help = (
f"Global test setup configuration. Available: {available_setups}. "
"Can be used with any suite. Example: --setup=ollama"
)
parser.addoption("--setup", help=setup_help)
MODEL_SHORT_IDS = { MODEL_SHORT_IDS = {
"meta-llama/Llama-3.2-3B-Instruct": "3B", "meta-llama/Llama-3.2-3B-Instruct": "3B",
@ -221,16 +246,12 @@ pytest_plugins = ["tests.integration.fixtures.common"]
def pytest_ignore_collect(path: str, config: pytest.Config) -> bool: def pytest_ignore_collect(path: str, config: pytest.Config) -> bool:
"""Skip collecting paths outside the selected suite roots for speed.""" """Skip collecting paths outside the selected suite roots for speed."""
suites_raw = config.getoption("--suite") suite = config.getoption("--suite")
if not suites_raw: if not suite:
return False return False
names = [p.strip() for p in str(suites_raw).split(",") if p.strip()] sobj = SUITE_DEFINITIONS.get(suite)
roots: list[str] = [] roots: list[str] = sobj.get("roots", []) if isinstance(sobj, dict) else getattr(sobj, "roots", [])
for name in names:
suite_def = SUITE_DEFINITIONS.get(name)
if suite_def:
roots.extend(suite_def.get("roots", []))
if not roots: if not roots:
return False return False

View file

@ -8,46 +8,111 @@
# For example: # For example:
# #
# ```bash # ```bash
# pytest tests/integration/ --suite=vision # pytest tests/integration/ --suite=vision --setup=ollama
# ``` # ```
# #
# Each suite can: """
# - restrict collection to specific roots (dirs or files) Each suite defines what to run (roots). Suites can be run with different global setups defined in setups.py.
# - provide default CLI option values (e.g. text_model, embedding_model, etc.) Setups provide environment variables and model defaults that can be reused across multiple suites.
CLI examples:
pytest tests/integration --suite=responses --setup=gpt
pytest tests/integration --suite=vision --setup=ollama
pytest tests/integration --suite=base --setup=vllm
"""
from pathlib import Path from pathlib import Path
from pydantic import BaseModel, Field
this_dir = Path(__file__).parent this_dir = Path(__file__).parent
default_roots = [
class Suite(BaseModel):
    """A named integration test suite: what to collect, plus an optional default setup.

    Attributes:
        name: Identifier of the suite.
        roots: Directories or files that the suite's test collection is rooted at.
        default_setup: Name of the setup to fall back on, or None when the suite
            has no default. Presumably a key of SETUP_DEFINITIONS -- confirm
            against the code that resolves it.
    """

    name: str
    roots: list[str]
    default_setup: str | None = Field(default=None)
class Setup(BaseModel):
    """Reusable test configuration bundling environment variables and CLI defaults.

    Attributes:
        name: Identifier of the setup.
        description: Human-readable summary of what the setup provides.
        defaults: CLI option values keyed by option destination name
            (e.g. ``text_model``).
        env: Environment variable names mapped to their values.
    """

    name: str
    description: str
    # Both mappings default to a fresh empty dict per instance.
    defaults: dict[str, str] = Field(default_factory=dict)
    env: dict[str, str] = Field(default_factory=dict)
# Global setups, keyed by the name accepted by ``--setup``. Any setup can technically be
# combined with any suite, but in practice some setups only make sense for specific suites
# (for example, "ollama-vision" only provides a vision model).
#
# Each entry's ``name`` matches its dictionary key so a Setup object can always be traced
# back to the key it was registered under.
SETUP_DEFINITIONS: dict[str, Setup] = {
    "ollama": Setup(
        name="ollama",
        description="Local Ollama provider with text + safety models",
        env={
            "OLLAMA_URL": "http://0.0.0.0:11434",
        },
        defaults={
            "text_model": "ollama/llama3.2:3b-instruct-fp16",
            "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
            "safety_model": "ollama/llama-guard3:1b",
            "safety_shield": "llama-guard",
        },
    ),
    "ollama-vision": Setup(
        # Was name="ollama", a copy-paste duplicate of the entry above.
        name="ollama-vision",
        description="Local Ollama provider with a vision model",
        env={
            "OLLAMA_URL": "http://0.0.0.0:11434",
        },
        defaults={
            "vision_model": "ollama/llama3.2-vision:11b",
            "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        },
    ),
    "vllm": Setup(
        name="vllm",
        description="vLLM provider with a text model",
        env={
            "VLLM_URL": "http://localhost:8000/v1",
        },
        defaults={
            "text_model": "vllm/meta-llama/Llama-3.2-1B-Instruct",
            "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        },
    ),
    # No env block: this setup declares only model defaults.
    "gpt": Setup(
        name="gpt",
        description="OpenAI GPT models for high-quality responses and tool calling",
        defaults={
            "text_model": "openai/gpt-4o",
            "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        },
    ),
}
base_roots = [
str(p) str(p)
for p in this_dir.glob("*") for p in this_dir.glob("*")
if p.is_dir() if p.is_dir()
and p.name not in ("__pycache__", "fixtures", "test_cases", "recordings", "responses", "post_training") and p.name not in ("__pycache__", "fixtures", "test_cases", "recordings", "responses", "post_training")
] ]
SUITE_DEFINITIONS: dict[str, dict] = { SUITE_DEFINITIONS: dict[str, Suite] = {
"base": { "base": Suite(
"description": "Base suite that includes most tests but runs them with a text Ollama model", name="base",
"roots": default_roots, roots=base_roots,
"defaults": { default_setup="ollama",
"text_model": "ollama/llama3.2:3b-instruct-fp16", ),
"embedding_model": "sentence-transformers/all-MiniLM-L6-v2", "responses": Suite(
}, name="responses",
}, roots=["tests/integration/responses"],
"responses": { default_setup="gpt",
"description": "Suite that includes only the OpenAI Responses tests; needs a strong tool-calling model", ),
"roots": ["tests/integration/responses"], "vision": Suite(
"defaults": { name="vision",
"text_model": "openai/gpt-4o", roots=["tests/integration/inference/test_vision_inference.py"],
"embedding_model": "sentence-transformers/all-MiniLM-L6-v2", default_setup="ollama-vision",
}, ),
},
"vision": {
"description": "Suite that includes only the vision tests",
"roots": ["tests/integration/inference/test_vision_inference.py"],
"defaults": {
"vision_model": "ollama/llama3.2-vision:11b",
"embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
},
},
} }