chore(verification): update README and reorganize generate_report.py (#1978)

# What does this PR do?


## Test Plan
```bash
uv run --with-editable ".[dev]" python tests/verifications/generate_report.py --run-tests
```
ehhuang 2025-04-17 10:41:22 -07:00 committed by GitHub
parent cb874287a4
commit 8bd6665775
8 changed files with 1205 additions and 1213 deletions


@@ -46,6 +46,7 @@ dev = [
"pytest-asyncio",
"pytest-cov",
"pytest-html",
"pytest-json-report",
"nbval", # For notebook testing
"black",
"ruff",

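For context on the `pytest-json-report` addition above: `generate_report.py` shells out to pytest and then parses the JSON results it writes. Below is a minimal, hedged sketch of that round trip, not taken from this diff; the flags are pytest-json-report's standard options, while the result path, provider name, and exact arguments that `run_tests()` passes are assumptions for illustration.

```python
# Hedged sketch: produce a pytest-json-report result file and read test outcomes.
# The result path and pytest arguments are illustrative; --provider is the custom
# option registered by this repo's conftest, per the README examples below.
import json
import subprocess

result_path = "tests/verifications/test_results/together.json"  # hypothetical location

subprocess.run(
    [
        "python", "-m", "pytest",
        "tests/verifications/openai_api",
        "--provider=together",
        "--json-report",
        f"--json-report-file={result_path}",
    ],
    check=False,  # verification failures should not abort report generation
)

with open(result_path) as f:
    report = json.load(f)

# pytest-json-report stores one entry per test, each with a nodeid and an outcome.
for test in report.get("tests", []):
    print(test["nodeid"], test["outcome"])
```

`parse_results()` then aggregates outcomes like these into the per-provider pass/fail tables shown in REPORT.md.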

@@ -8,29 +8,44 @@ This framework allows you to run the same set of verification tests against diff
## Features
The verification suite currently tests:
The verification suite currently tests the following in both streaming and non-streaming modes:
- Basic chat completions (streaming and non-streaming)
- Basic chat completions
- Image input capabilities
- Structured JSON output formatting
- Tool calling functionality
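
For reference (not part of this diff), each capability above boils down to a plain OpenAI-compatible chat completion call. Here is a hedged sketch of a tool-calling request; the base URL, API key, and model id are placeholders rather than values from this repository.

```python
# Illustrative only: the kind of OpenAI-compatible request the verification tests issue.
# base_url, api_key, and model are placeholders, not values taken from this repo.
from openai import OpenAI

client = OpenAI(base_url="https://example-provider.com/v1", api_key="sk-...")

response = client.chat.completions.create(
    model="example/llama-model",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
    stream=False,  # non-streaming variant
)
print(response.choices[0].message.tool_calls)
```

The streaming variants presumably issue the same request with `stream=True` and consume the resulting chunk iterator.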
## Report
The latest report can be found at [REPORT.md](REPORT.md).
To update the report, ensure you have the API keys set,
```bash
export OPENAI_API_KEY=<your_openai_api_key>
export FIREWORKS_API_KEY=<your_fireworks_api_key>
export TOGETHER_API_KEY=<your_together_api_key>
```
then run
```bash
uv run --with-editable ".[dev]" python tests/verifications/generate_report.py --run-tests
```
## Running Tests
To run the verification tests, use pytest with the following parameters:
```bash
cd llama-stack
pytest tests/verifications/openai --provider=<provider-name>
pytest tests/verifications/openai_api --provider=<provider-name>
```
Example:
```bash
# Run all tests
pytest tests/verifications/openai --provider=together
pytest tests/verifications/openai_api --provider=together
# Only run tests with Llama 4 models
pytest tests/verifications/openai --provider=together -k 'Llama-4'
pytest tests/verifications/openai_api --provider=together -k 'Llama-4'
```
### Parameters
@@ -41,23 +56,22 @@ pytest tests/verifications/openai --provider=together -k 'Llama-4'
## Supported Providers
The verification suite currently supports:
- OpenAI
- Fireworks
- Together
- Groq
- Cerebras
The verification suite supports any provider with an OpenAI compatible endpoint.
See `tests/verifications/conf/` for the list of supported providers.
To run on a new provider, simply add a new yaml file to the `conf/` directory with the provider config. See `tests/verifications/conf/together.yaml` for an example.
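
A hedged sketch of how such per-provider configs might be loaded on the Python side follows; the real `_load_all_verification_configs()` helper is not shown in this diff, and the yaml field names in the comment (`base_url`, `api_key_var`, `models`) are assumptions, so treat `together.yaml` as the authoritative schema.

```python
# Hedged sketch of loading per-provider configs from tests/verifications/conf/.
# The yaml field names are assumptions; see conf/together.yaml for the real schema.
from pathlib import Path

import yaml


CONF_DIR = Path("tests/verifications/conf")


def load_provider_configs() -> dict[str, dict]:
    configs = {}
    for conf_file in sorted(CONF_DIR.glob("*.yaml")):
        # e.g. {"base_url": "...", "api_key_var": "...", "models": [...]}
        configs[conf_file.stem] = yaml.safe_load(conf_file.read_text())
    return configs
```

Per the examples above, the file stem (e.g. `together`) is also the value passed to `--provider`.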
## Adding New Test Cases
To add new test cases, create appropriate JSON files in the `openai/fixtures/test_cases/` directory following the existing patterns.
To add new test cases, create appropriate JSON files in the `openai_api/fixtures/test_cases/` directory following the existing patterns.
## Structure
- `__init__.py` - Marks the directory as a Python package
- `conftest.py` - Global pytest configuration and fixtures
- `openai/` - Tests specific to OpenAI-compatible APIs
- `conf/` - Provider-specific configuration files
- `openai_api/` - Tests specific to OpenAI-compatible APIs
- `fixtures/` - Test fixtures and utilities
- `fixtures.py` - Provider-specific fixtures
- `load.py` - Utilities for loading test cases


@@ -1,6 +1,6 @@
# Test Results Report
*Generated on: 2025-04-14 18:11:37*
*Generated on: 2025-04-16 15:10:57*
*This report was generated by running `python tests/verifications/generate_report.py`*
@@ -15,7 +15,7 @@
| Provider | Pass Rate | Tests Passed | Total Tests |
| --- | --- | --- | --- |
| Together | 48.7% | 37 | 76 |
| Together | 51.3% | 39 | 76 |
| Fireworks | 47.4% | 36 | 76 |
| Openai | 100.0% | 52 | 52 |
@@ -23,7 +23,7 @@
## Together
*Tests run on: 2025-04-14 18:08:14*
*Tests run on: 2025-04-16 15:03:51*
```bash
# Run all tests for this provider:
@@ -49,8 +49,8 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=togethe
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
@@ -74,7 +74,7 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=togethe
## Fireworks
*Tests run on: 2025-04-14 18:04:06*
*Tests run on: 2025-04-16 15:05:54*
```bash
# Run all tests for this provider:
@@ -125,7 +125,7 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=firewor
## Openai
*Tests run on: 2025-04-14 18:09:51*
*Tests run on: 2025-04-16 15:09:18*
```bash
# Run all tests for this provider:


@@ -3,14 +3,6 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "pytest-json-report",
# "pyyaml",
# ]
# ///
"""
Test Report Generator
@@ -67,16 +59,10 @@ RESULTS_DIR.mkdir(exist_ok=True)
# Maximum number of test result files to keep per provider
MAX_RESULTS_PER_PROVIDER = 1
PROVIDER_ORDER = [
DEFAULT_PROVIDERS = [
"together",
"fireworks",
"groq",
"cerebras",
"openai",
"together-llama-stack",
"fireworks-llama-stack",
"groq-llama-stack",
"openai-llama-stack",
]
VERIFICATION_CONFIG = _load_all_verification_configs()
@@ -142,6 +128,14 @@ def run_tests(provider, keyword=None):
return None
def run_multiple_tests(providers_to_run: list[str], keyword: str | None):
"""Runs tests for a list of providers."""
print(f"Running tests for providers: {', '.join(providers_to_run)}")
for provider in providers_to_run:
run_tests(provider.strip(), keyword=keyword)
print("Finished running tests.")
def parse_results(
result_file,
) -> Tuple[DefaultDict[str, DefaultDict[str, Dict[str, bool]]], DefaultDict[str, Set[str]], Set[str], str]:
@@ -250,20 +244,6 @@ def parse_results(
return parsed_results, providers_in_file, tests_in_file, run_timestamp_str
def get_all_result_files_by_provider():
"""Get all test result files, keyed by provider."""
provider_results = {}
result_files = list(RESULTS_DIR.glob("*.json"))
for file in result_files:
provider = file.stem
if provider:
provider_results[provider] = file
return provider_results
def generate_report(
results_dict: Dict[str, Any],
providers: Dict[str, Set[str]],
@@ -276,6 +256,7 @@ def generate_report(
Args:
results_dict: Aggregated results [provider][model][test_name] -> status.
providers: Dict of all providers and their models {provider: {models}}.
The order of keys in this dict determines the report order.
all_tests: Set of all test names found.
provider_timestamps: Dict of provider to timestamp when tests were run
output_file: Optional path to save the report.
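
The added docstring line is the crux of the reorganization: instead of sorting providers against `PROVIDER_ORDER`, the report now follows the insertion order of the dict that `main()` builds. A minimal illustration of the underlying Python behavior (provider names here are just examples):

```python
# Python dicts preserve insertion order (guaranteed since 3.7), so the order in which
# main() adds providers to its providers dict is the order they appear in the report.
providers = {}
providers["together"] = {"model-a"}
providers["fireworks"] = {"model-b"}
providers["openai"] = {"model-c"}

for provider in providers:
    print(provider)  # -> together, fireworks, openai
```

This is why `main()` below populates `final_providers_order` in the order of `desired_providers` rather than relying on a separate ordering constant.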
@@ -353,22 +334,17 @@
passed_tests += 1
provider_totals[provider] = (provider_passed, provider_total)
# Add summary table (use passed-in providers dict)
# Add summary table (use the order from the providers dict keys)
report.append("| Provider | Pass Rate | Tests Passed | Total Tests |")
report.append("| --- | --- | --- | --- |")
for provider in [p for p in PROVIDER_ORDER if p in providers]: # Check against keys of passed-in dict
passed, total = provider_totals.get(provider, (0, 0))
pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
for provider in [p for p in providers if p not in PROVIDER_ORDER]: # Check against keys of passed-in dict
# Iterate through providers in the order they appear in the input dict
for provider in providers_sorted.keys():
passed, total = provider_totals.get(provider, (0, 0))
pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
report.append("\n")
for provider in sorted(
providers_sorted.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
):
for provider in providers_sorted.keys():
provider_models = providers_sorted[provider] # Use sorted models
if not provider_models:
continue
@@ -461,60 +437,62 @@ def main():
"--providers",
type=str,
nargs="+",
help="Specify providers to test (comma-separated or space-separated, default: all)",
help="Specify providers to include/test (comma-separated or space-separated, default: uses DEFAULT_PROVIDERS)",
)
parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)")
parser.add_argument("--k", type=str, help="Keyword expression to filter tests (passed to pytest -k)")
args = parser.parse_args()
all_results = {}
# Initialize collections to aggregate results in main
aggregated_providers = defaultdict(set)
final_providers_order = {} # Dictionary to store results, preserving processing order
aggregated_tests = set()
provider_timestamps = {}
if args.run_tests:
# Get list of available providers from command line or use detected providers
if args.providers:
# Handle both comma-separated and space-separated lists
test_providers = []
for provider_arg in args.providers:
# Split by comma if commas are present
if "," in provider_arg:
test_providers.extend(provider_arg.split(","))
else:
test_providers.append(provider_arg)
else:
# Default providers to test
test_providers = PROVIDER_ORDER
for provider in test_providers:
provider = provider.strip() # Remove any whitespace
result_file = run_tests(provider, keyword=args.k)
if result_file:
# Parse and aggregate results
parsed_results, providers_in_file, tests_in_file, run_timestamp = parse_results(result_file)
all_results.update(parsed_results)
for prov, models in providers_in_file.items():
aggregated_providers[prov].update(models)
if run_timestamp:
provider_timestamps[prov] = run_timestamp
aggregated_tests.update(tests_in_file)
# 1. Determine the desired list and order of providers
if args.providers:
desired_providers = []
for provider_arg in args.providers:
desired_providers.extend([p.strip() for p in provider_arg.split(",")])
else:
# Use existing results
provider_result_files = get_all_result_files_by_provider()
desired_providers = DEFAULT_PROVIDERS # Use default order/list
for result_file in provider_result_files.values():
# Parse and aggregate results
parsed_results, providers_in_file, tests_in_file, run_timestamp = parse_results(result_file)
all_results.update(parsed_results)
for prov, models in providers_in_file.items():
aggregated_providers[prov].update(models)
if run_timestamp:
provider_timestamps[prov] = run_timestamp
aggregated_tests.update(tests_in_file)
# 2. Run tests if requested (using the desired provider list)
if args.run_tests:
run_multiple_tests(desired_providers, args.k)
generate_report(all_results, aggregated_providers, aggregated_tests, provider_timestamps, args.output)
for provider in desired_providers:
# Construct the expected result file path directly
result_file = RESULTS_DIR / f"{provider}.json"
if result_file.exists(): # Check if the specific file exists
print(f"Loading results for {provider} from {result_file}")
try:
parsed_data = parse_results(result_file)
parsed_results, providers_in_file, tests_in_file, run_timestamp = parsed_data
all_results.update(parsed_results)
aggregated_tests.update(tests_in_file)
# Add models for this provider, ensuring it's added in the correct report order
if provider in providers_in_file:
if provider not in final_providers_order:
final_providers_order[provider] = set()
final_providers_order[provider].update(providers_in_file[provider])
if run_timestamp != "Unknown":
provider_timestamps[provider] = run_timestamp
else:
print(
f"Warning: Provider '{provider}' found in desired list but not within its result file data ({result_file})."
)
except Exception as e:
print(f"Error parsing results for provider {provider} from {result_file}: {e}")
else:
# Only print warning if we expected results (i.e., provider was in the desired list)
print(f"Result file for desired provider '{provider}' not found at {result_file}. Skipping.")
# 5. Generate the report using the filtered & ordered results
print(f"Final Provider Order for Report: {list(final_providers_order.keys())}")
generate_report(all_results, final_providers_order, aggregated_tests, provider_timestamps, args.output)
if __name__ == "__main__":

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

uv.lock (generated)

@@ -1,4 +1,5 @@
version = 1
revision = 1
requires-python = ">=3.10"
resolution-markers = [
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
@@ -1410,6 +1411,7 @@ dev = [
{ name = "pytest-asyncio" },
{ name = "pytest-cov" },
{ name = "pytest-html" },
{ name = "pytest-json-report" },
{ name = "ruamel-yaml" },
{ name = "ruff" },
{ name = "types-requests" },
@@ -1502,6 +1504,7 @@ requires-dist = [
{ name = "pytest-asyncio", marker = "extra == 'dev'" },
{ name = "pytest-cov", marker = "extra == 'dev'" },
{ name = "pytest-html", marker = "extra == 'dev'" },
{ name = "pytest-json-report", marker = "extra == 'dev'" },
{ name = "python-dotenv" },
{ name = "qdrant-client", marker = "extra == 'unit'" },
{ name = "requests" },
@@ -1531,6 +1534,7 @@ requires-dist = [
{ name = "types-setuptools", marker = "extra == 'dev'" },
{ name = "uvicorn", marker = "extra == 'dev'" },
]
provides-extras = ["dev", "unit", "test", "docs", "codegen", "ui"]
[[package]]
name = "llama-stack-client"
@@ -2740,6 +2744,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c8/c7/c160021cbecd956cc1a6f79e5fe155f7868b2e5b848f1320dad0b3e3122f/pytest_html-4.1.1-py3-none-any.whl", hash = "sha256:c8152cea03bd4e9bee6d525573b67bbc6622967b72b9628dda0ea3e2a0b5dd71", size = 23491 },
]
[[package]]
name = "pytest-json-report"
version = "1.5.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pytest" },
{ name = "pytest-metadata" },
]
sdist = { url = "https://files.pythonhosted.org/packages/4f/d3/765dae9712fcd68d820338908c1337e077d5fdadccd5cacf95b9b0bea278/pytest-json-report-1.5.0.tar.gz", hash = "sha256:2dde3c647851a19b5f3700729e8310a6e66efb2077d674f27ddea3d34dc615de", size = 21241 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/81/35/d07400c715bf8a88aa0c1ee9c9eb6050ca7fe5b39981f0eea773feeb0681/pytest_json_report-1.5.0-py3-none-any.whl", hash = "sha256:9897b68c910b12a2e48dd849f9a284b2c79a732a8a9cb398452ddd23d3c8c325", size = 13222 },
]
[[package]]
name = "pytest-metadata"
version = "3.1.1"