# Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. """ Test Report Generator Requirements: pip install pytest-json-report Usage: # Generate a report using existing test results python tests/verifications/generate_report.py # Run tests and generate a report python tests/verifications/generate_report.py --run-tests # Run tests for specific providers python tests/verifications/generate_report.py --run-tests --providers fireworks openai # Save the report to a custom location python tests/verifications/generate_report.py --output custom_report.md # Clean up old test result files python tests/verifications/generate_report.py --cleanup """ import argparse import json import os import re import subprocess import time from collections import defaultdict from pathlib import Path # Define the root directory for test results RESULTS_DIR = Path(__file__).parent / "test_results" RESULTS_DIR.mkdir(exist_ok=True) # Maximum number of test result files to keep per provider MAX_RESULTS_PER_PROVIDER = 1 # Custom order of providers PROVIDER_ORDER = ["together", "fireworks", "groq", "cerebras", "openai"] # Dictionary to store providers and their models (will be populated dynamically) PROVIDERS = defaultdict(set) # Tests will be dynamically extracted from results ALL_TESTS = set() def run_tests(provider): """Run pytest for a specific provider and save results""" print(f"Running tests for provider: {provider}") timestamp = int(time.time()) result_file = RESULTS_DIR / f"{provider}_{timestamp}.json" temp_json_file = RESULTS_DIR / f"temp_{provider}_{timestamp}.json" # Run pytest with JSON output cmd = [ "python", "-m", "pytest", "tests/verifications/openai/test_chat_completion.py", f"--provider={provider}", "-v", "--json-report", f"--json-report-file={temp_json_file}", ] try: result = subprocess.run(cmd, capture_output=True, text=True) print(f"Pytest exit code: {result.returncode}") # Check if the JSON file was created if temp_json_file.exists(): # Read the JSON file and save it to our results format with open(temp_json_file, "r") as f: test_results = json.load(f) # Save results to our own format with a trailing newline with open(result_file, "w") as f: json.dump(test_results, f, indent=2) f.write("\n") # Add a trailing newline for precommit # Clean up temp file temp_json_file.unlink() print(f"Test results saved to {result_file}") return result_file else: print(f"Error: JSON report file not created for {provider}") print(f"Command stdout: {result.stdout}") print(f"Command stderr: {result.stderr}") return None except Exception as e: print(f"Error running tests for {provider}: {e}") return None def parse_results(result_file): """Parse the test results file and extract pass/fail by model and test""" if not os.path.exists(result_file): print(f"Results file does not exist: {result_file}") return {} with open(result_file, "r") as f: results = json.load(f) # Initialize results dictionary parsed_results = defaultdict(lambda: defaultdict(dict)) provider = os.path.basename(result_file).split("_")[0] # Debug: Print summary of test results print(f"Test results summary for {provider}:") print(f"Total tests: {results.get('summary', {}).get('total', 0)}") print(f"Passed: {results.get('summary', {}).get('passed', 0)}") print(f"Failed: {results.get('summary', {}).get('failed', 0)}") print(f"Error: {results.get('summary', {}).get('error', 0)}") print(f"Skipped: {results.get('summary', {}).get('skipped', 0)}") # Extract test results if "tests" not in results or not results["tests"]: print(f"No test results found in {result_file}") return parsed_results # Map for normalizing model names model_name_map = { "Llama-3.3-8B-Instruct": "Llama-3.3-8B-Instruct", "Llama-3.3-70B-Instruct": "Llama-3.3-70B-Instruct", "Llama-3.2-11B-Vision-Instruct": "Llama-3.2-11B-Vision-Instruct", "Llama-4-Scout-17B-16E": "Llama-4-Scout-17B-16E-Instruct", "Llama-4-Scout-17B-16E-Instruct": "Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E": "Llama-4-Maverick-17B-128E-Instruct", "Llama-4-Maverick-17B-128E-Instruct": "Llama-4-Maverick-17B-128E-Instruct", "gpt-4o": "gpt-4o", "gpt-4o-mini": "gpt-4o-mini", } # Keep track of all models found for this provider provider_models = set() # Track all unique test cases for each base test test_case_counts = defaultdict(int) # First pass: count the number of cases for each test for test in results["tests"]: test_id = test.get("nodeid", "") if "call" in test: test_name = test_id.split("::")[1].split("[")[0] input_output_match = re.search(r"\[input_output(\d+)-", test_id) if input_output_match: test_case_counts[test_name] += 1 # Second pass: process the tests with case numbers only for tests with multiple cases for test in results["tests"]: test_id = test.get("nodeid", "") outcome = test.get("outcome", "") # Only process tests that have been executed (not setup errors) if "call" in test: # Regular test that actually ran test_name = test_id.split("::")[1].split("[")[0] # Extract input_output parameter to differentiate between test cases input_output_match = re.search(r"\[input_output(\d+)-", test_id) input_output_index = input_output_match.group(1) if input_output_match else "" # Create a more detailed test name with case number only if there are multiple cases detailed_test_name = test_name if input_output_index and test_case_counts[test_name] > 1: detailed_test_name = f"{test_name} (case {input_output_index})" # Track all unique test names ALL_TESTS.add(detailed_test_name) # Extract model name from test_id using a more robust pattern model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id) if model_match: raw_model = model_match.group(1) model = model_name_map.get(raw_model, raw_model) # Add to set of known models for this provider provider_models.add(model) # Also update the global PROVIDERS dictionary PROVIDERS[provider].add(model) # Store the result if outcome == "passed": parsed_results[provider][model][detailed_test_name] = True else: parsed_results[provider][model][detailed_test_name] = False print(f"Parsed test result: {detailed_test_name} for model {model}: {outcome}") elif outcome == "error" and "setup" in test and test.get("setup", {}).get("outcome") == "failed": # This is a setup failure, which likely means a configuration issue # Extract the base test name and model name parts = test_id.split("::") if len(parts) > 1: test_name = parts[1].split("[")[0] # Extract input_output parameter to differentiate between test cases input_output_match = re.search(r"\[input_output(\d+)-", test_id) input_output_index = input_output_match.group(1) if input_output_match else "" # Create a more detailed test name with case number only if there are multiple cases detailed_test_name = test_name if input_output_index and test_case_counts[test_name] > 1: detailed_test_name = f"{test_name} (case {input_output_index})" if detailed_test_name in ALL_TESTS: # Use a more robust pattern for model extraction model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id) if model_match: raw_model = model_match.group(1) model = model_name_map.get(raw_model, raw_model) # Add to set of known models for this provider provider_models.add(model) # Also update the global PROVIDERS dictionary PROVIDERS[provider].add(model) # Mark setup failures as false (failed) parsed_results[provider][model][detailed_test_name] = False print(f"Parsed setup failure: {detailed_test_name} for model {model}") # Debug: Print parsed results if not parsed_results[provider]: print(f"Warning: No test results parsed for provider {provider}") else: for model, tests in parsed_results[provider].items(): print(f"Model {model}: {len(tests)} test results") return parsed_results def cleanup_old_results(): """Clean up old test result files, keeping only the newest N per provider""" for provider in PROVIDERS.keys(): # Get all result files for this provider provider_files = list(RESULTS_DIR.glob(f"{provider}_*.json")) # Sort by timestamp (newest first) provider_files.sort(key=lambda x: int(x.stem.split("_")[1]), reverse=True) # Remove old files beyond the max to keep if len(provider_files) > MAX_RESULTS_PER_PROVIDER: for old_file in provider_files[MAX_RESULTS_PER_PROVIDER:]: try: old_file.unlink() print(f"Removed old result file: {old_file}") except Exception as e: print(f"Error removing file {old_file}: {e}") def get_latest_results_by_provider(): """Get the latest test result file for each provider""" provider_results = {} # Get all result files result_files = list(RESULTS_DIR.glob("*.json")) # Extract all provider names from filenames all_providers = set() for file in result_files: # File format is provider_timestamp.json parts = file.stem.split("_") if len(parts) >= 2: all_providers.add(parts[0]) # Group by provider for provider in all_providers: provider_files = [f for f in result_files if f.name.startswith(f"{provider}_")] # Sort by timestamp (newest first) provider_files.sort(key=lambda x: int(x.stem.split("_")[1]), reverse=True) if provider_files: provider_results[provider] = provider_files[0] return provider_results def generate_report(results_dict, output_file=None): """Generate the markdown report""" if output_file is None: # Default to creating the report in the same directory as this script output_file = Path(__file__).parent / "REPORT.md" else: output_file = Path(output_file) # Get the timestamp from result files provider_timestamps = {} provider_results = get_latest_results_by_provider() for provider, result_file in provider_results.items(): # Extract timestamp from filename (format: provider_timestamp.json) try: timestamp_str = result_file.stem.split("_")[1] timestamp = int(timestamp_str) formatted_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp)) provider_timestamps[provider] = formatted_time except (IndexError, ValueError): provider_timestamps[provider] = "Unknown" # Convert provider model sets to sorted lists for provider in PROVIDERS: PROVIDERS[provider] = sorted(PROVIDERS[provider]) # Sort tests alphabetically sorted_tests = sorted(ALL_TESTS) report = ["# Test Results Report\n"] report.append(f"*Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n") report.append("*This report was generated by running `python tests/verifications/generate_report.py`*\n") # Icons for pass/fail pass_icon = "✅" fail_icon = "❌" na_icon = "⚪" # Add emoji legend report.append("## Legend\n") report.append(f"- {pass_icon} - Test passed") report.append(f"- {fail_icon} - Test failed") report.append(f"- {na_icon} - Test not applicable or not run for this model") report.append("\n") # Add a summary section report.append("## Summary\n") # Count total tests and passes total_tests = 0 passed_tests = 0 provider_totals = {} # Prepare summary data for provider in PROVIDERS.keys(): provider_passed = 0 provider_total = 0 if provider in results_dict: provider_models = PROVIDERS[provider] for model in provider_models: if model in results_dict[provider]: model_results = results_dict[provider][model] for test in sorted_tests: if test in model_results: provider_total += 1 total_tests += 1 if model_results[test]: provider_passed += 1 passed_tests += 1 provider_totals[provider] = (provider_passed, provider_total) # Add summary table report.append("| Provider | Pass Rate | Tests Passed | Total Tests |") report.append("| --- | --- | --- | --- |") # Use the custom order for summary table for provider in [p for p in PROVIDER_ORDER if p in PROVIDERS]: passed, total = provider_totals.get(provider, (0, 0)) pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A" report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |") # Add providers not in the custom order for provider in [p for p in PROVIDERS if p not in PROVIDER_ORDER]: passed, total = provider_totals.get(provider, (0, 0)) pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A" report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |") report.append("\n") # Process each provider in the custom order, then any additional providers for provider in sorted( PROVIDERS.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p) ): if not PROVIDERS[provider]: # Skip providers with no models continue report.append(f"\n## {provider.capitalize()}\n") # Add timestamp when test was run if provider in provider_timestamps: report.append(f"*Tests run on: {provider_timestamps[provider]}*\n") # Add test command for reproducing results test_cmd = f"pytest tests/verifications/openai/test_chat_completion.py --provider={provider} -v" report.append(f"```bash\n{test_cmd}\n```\n") # Get the relevant models for this provider provider_models = PROVIDERS[provider] # Create table header with models as columns header = "| Test | " + " | ".join(provider_models) + " |" separator = "| --- | " + " | ".join(["---"] * len(provider_models)) + " |" report.append(header) report.append(separator) # Get results for this provider provider_results = results_dict.get(provider, {}) # Add rows for each test for test in sorted_tests: row = f"| {test} |" # Add results for each model in this test for model in provider_models: if model in provider_results and test in provider_results[model]: result = pass_icon if provider_results[model][test] else fail_icon else: result = na_icon row += f" {result} |" report.append(row) # Write to file with open(output_file, "w") as f: f.write("\n".join(report)) f.write("\n") print(f"Report generated: {output_file}") def main(): parser = argparse.ArgumentParser(description="Generate test report") parser.add_argument("--run-tests", action="store_true", help="Run tests before generating report") parser.add_argument( "--providers", type=str, nargs="+", help="Specify providers to test (comma-separated or space-separated, default: all)", ) parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)") args = parser.parse_args() all_results = {} if args.run_tests: # Get list of available providers from command line or use detected providers if args.providers: # Handle both comma-separated and space-separated lists test_providers = [] for provider_arg in args.providers: # Split by comma if commas are present if "," in provider_arg: test_providers.extend(provider_arg.split(",")) else: test_providers.append(provider_arg) else: # Default providers to test test_providers = PROVIDER_ORDER for provider in test_providers: provider = provider.strip() # Remove any whitespace result_file = run_tests(provider) if result_file: provider_results = parse_results(result_file) all_results.update(provider_results) else: # Use existing results provider_result_files = get_latest_results_by_provider() for result_file in provider_result_files.values(): provider_results = parse_results(result_file) all_results.update(provider_results) # Generate the report generate_report(all_results, args.output) cleanup_old_results() if __name__ == "__main__": main()