chore(verification): update README and reorganize generate_report.py (#1978)

# What does this PR do?


## Test Plan
```bash
uv run --with-editable ".[dev]" python tests/verifications/generate_report.py --run-tests
```
ehhuang 2025-04-17 10:41:22 -07:00 committed by GitHub
parent cb874287a4
commit 8bd6665775
8 changed files with 1205 additions and 1213 deletions


@@ -46,6 +46,7 @@ dev = [
"pytest-asyncio",
"pytest-cov",
"pytest-html",
"pytest-json-report",
"nbval", # For notebook testing
"black",
"ruff",

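For context on the `pytest-json-report` addition above: `generate_report.py` shells out to pytest and then parses the JSON results it writes. Below is a minimal, hedged sketch of that round trip, not taken from this diff; the flags are pytest-json-report's standard options, while the result path, provider name, and exact arguments that `run_tests()` passes are assumptions for illustration.

```python
# Hedged sketch: produce a pytest-json-report result file and read test outcomes.
# The result path and pytest arguments are illustrative; --provider is the custom
# option registered by this repo's conftest, per the README examples below.
import json
import subprocess

result_path = "tests/verifications/test_results/together.json"  # hypothetical location

subprocess.run(
    [
        "python", "-m", "pytest",
        "tests/verifications/openai_api",
        "--provider=together",
        "--json-report",
        f"--json-report-file={result_path}",
    ],
    check=False,  # verification failures should not abort report generation
)

with open(result_path) as f:
    report = json.load(f)

# pytest-json-report stores one entry per test, each with a nodeid and an outcome.
for test in report.get("tests", []):
    print(test["nodeid"], test["outcome"])
```

`parse_results()` then aggregates outcomes like these into the per-provider pass/fail tables shown in REPORT.md.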

@@ -8,29 +8,44 @@ This framework allows you to run the same set of verification tests against diff
## Features
The verification suite currently tests:
The verification suite currently tests the following in both streaming and non-streaming modes:
- Basic chat completions (streaming and non-streaming)
- Basic chat completions
- Image input capabilities
- Structured JSON output formatting
- Tool calling functionality
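
For reference (not part of this diff), each capability above boils down to a plain OpenAI-compatible chat completion call. Here is a hedged sketch of a tool-calling request; the base URL, API key, and model id are placeholders rather than values from this repository.

```python
# Illustrative only: the kind of OpenAI-compatible request the verification tests issue.
# base_url, api_key, and model are placeholders, not values taken from this repo.
from openai import OpenAI

client = OpenAI(base_url="https://example-provider.com/v1", api_key="sk-...")

response = client.chat.completions.create(
    model="example/llama-model",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
    stream=False,  # non-streaming variant
)
print(response.choices[0].message.tool_calls)
```

The streaming variants presumably issue the same request with `stream=True` and consume the resulting chunk iterator.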
## Report
The latest report can be found at [REPORT.md](REPORT.md).
To update the report, ensure you have the API keys set,
```bash
export OPENAI_API_KEY=<your_openai_api_key>
export FIREWORKS_API_KEY=<your_fireworks_api_key>
export TOGETHER_API_KEY=<your_together_api_key>
```
then run
```bash
uv run --with-editable ".[dev]" python tests/verifications/generate_report.py --run-tests
```
## Running Tests
To run the verification tests, use pytest with the following parameters:
```bash
cd llama-stack
pytest tests/verifications/openai --provider=<provider-name>
pytest tests/verifications/openai_api --provider=<provider-name>
```
Example:
```bash
# Run all tests
pytest tests/verifications/openai --provider=together
pytest tests/verifications/openai_api --provider=together
# Only run tests with Llama 4 models
pytest tests/verifications/openai --provider=together -k 'Llama-4'
pytest tests/verifications/openai_api --provider=together -k 'Llama-4'
```
### Parameters
@@ -41,23 +56,22 @@ pytest tests/verifications/openai --provider=together -k 'Llama-4'
## Supported Providers
The verification suite currently supports:
- OpenAI
- Fireworks
- Together
- Groq
- Cerebras
The verification suite supports any provider with an OpenAI compatible endpoint.
See `tests/verifications/conf/` for the list of supported providers.
To run on a new provider, simply add a new yaml file to the `conf/` directory with the provider config. See `tests/verifications/conf/together.yaml` for an example.
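
A hedged sketch of how such per-provider configs might be loaded on the Python side follows; the real `_load_all_verification_configs()` helper is not shown in this diff, and the yaml field names in the comment (`base_url`, `api_key_var`, `models`) are assumptions, so treat `together.yaml` as the authoritative schema.

```python
# Hedged sketch of loading per-provider configs from tests/verifications/conf/.
# The yaml field names are assumptions; see conf/together.yaml for the real schema.
from pathlib import Path

import yaml


CONF_DIR = Path("tests/verifications/conf")


def load_provider_configs() -> dict[str, dict]:
    configs = {}
    for conf_file in sorted(CONF_DIR.glob("*.yaml")):
        # e.g. {"base_url": "...", "api_key_var": "...", "models": [...]}
        configs[conf_file.stem] = yaml.safe_load(conf_file.read_text())
    return configs
```

Per the examples above, the file stem (e.g. `together`) is also the value passed to `--provider`.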
## Adding New Test Cases
To add new test cases, create appropriate JSON files in the `openai/fixtures/test_cases/` directory following the existing patterns.
To add new test cases, create appropriate JSON files in the `openai_api/fixtures/test_cases/` directory following the existing patterns.
## Structure
- `__init__.py` - Marks the directory as a Python package
- `conftest.py` - Global pytest configuration and fixtures
- `openai/` - Tests specific to OpenAI-compatible APIs
- `conf/` - Provider-specific configuration files
- `openai_api/` - Tests specific to OpenAI-compatible APIs
- `fixtures/` - Test fixtures and utilities
- `fixtures.py` - Provider-specific fixtures
- `load.py` - Utilities for loading test cases


@@ -1,6 +1,6 @@
# Test Results Report
*Generated on: 2025-04-14 18:11:37*
*Generated on: 2025-04-16 15:10:57*
*This report was generated by running `python tests/verifications/generate_report.py`*
@@ -15,7 +15,7 @@
| Provider | Pass Rate | Tests Passed | Total Tests |
| --- | --- | --- | --- |
| Together | 48.7% | 37 | 76 |
| Together | 51.3% | 39 | 76 |
| Fireworks | 47.4% | 36 | 76 |
| Openai | 100.0% | 52 | 52 |
@@ -23,7 +23,7 @@
## Together
*Tests run on: 2025-04-14 18:08:14*
*Tests run on: 2025-04-16 15:03:51*
```bash
# Run all tests for this provider:
@@ -49,8 +49,8 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=togethe
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
@@ -74,7 +74,7 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=togethe
## Fireworks
*Tests run on: 2025-04-14 18:04:06*
*Tests run on: 2025-04-16 15:05:54*
```bash
# Run all tests for this provider:
@@ -125,7 +125,7 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=firewor
## Openai
*Tests run on: 2025-04-14 18:09:51*
*Tests run on: 2025-04-16 15:09:18*
```bash
# Run all tests for this provider:


@@ -3,14 +3,6 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "pytest-json-report",
# "pyyaml",
# ]
# ///
"""
Test Report Generator
@@ -67,16 +59,10 @@ RESULTS_DIR.mkdir(exist_ok=True)
# Maximum number of test result files to keep per provider
MAX_RESULTS_PER_PROVIDER = 1
PROVIDER_ORDER = [
DEFAULT_PROVIDERS = [
"together",
"fireworks",
"groq",
"cerebras",
"openai",
"together-llama-stack",
"fireworks-llama-stack",
"groq-llama-stack",
"openai-llama-stack",
]
VERIFICATION_CONFIG = _load_all_verification_configs()
@@ -142,6 +128,14 @@ def run_tests(provider, keyword=None):
return None
def run_multiple_tests(providers_to_run: list[str], keyword: str | None):
"""Runs tests for a list of providers."""
print(f"Running tests for providers: {', '.join(providers_to_run)}")
for provider in providers_to_run:
run_tests(provider.strip(), keyword=keyword)
print("Finished running tests.")
def parse_results(
result_file,
) -> Tuple[DefaultDict[str, DefaultDict[str, Dict[str, bool]]], DefaultDict[str, Set[str]], Set[str], str]:
@@ -250,20 +244,6 @@ def parse_results(
return parsed_results, providers_in_file, tests_in_file, run_timestamp_str
def get_all_result_files_by_provider():
"""Get all test result files, keyed by provider."""
provider_results = {}
result_files = list(RESULTS_DIR.glob("*.json"))
for file in result_files:
provider = file.stem
if provider:
provider_results[provider] = file
return provider_results
def generate_report(
results_dict: Dict[str, Any],
providers: Dict[str, Set[str]],
@@ -276,6 +256,7 @@ def generate_report(
Args:
results_dict: Aggregated results [provider][model][test_name] -> status.
providers: Dict of all providers and their models {provider: {models}}.
The order of keys in this dict determines the report order.
all_tests: Set of all test names found.
provider_timestamps: Dict of provider to timestamp when tests were run
output_file: Optional path to save the report.
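
The added docstring line is the crux of the reorganization: instead of sorting providers against `PROVIDER_ORDER`, the report now follows the insertion order of the dict that `main()` builds. A minimal illustration of the underlying Python behavior (provider names here are just examples):

```python
# Python dicts preserve insertion order (guaranteed since 3.7), so the order in which
# main() adds providers to its providers dict is the order they appear in the report.
providers = {}
providers["together"] = {"model-a"}
providers["fireworks"] = {"model-b"}
providers["openai"] = {"model-c"}

for provider in providers:
    print(provider)  # -> together, fireworks, openai
```

This is why `main()` below populates `final_providers_order` in the order of `desired_providers` rather than relying on a separate ordering constant.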
@@ -353,22 +334,17 @@
passed_tests += 1
provider_totals[provider] = (provider_passed, provider_total)
# Add summary table (use passed-in providers dict)
# Add summary table (use the order from the providers dict keys)
report.append("| Provider | Pass Rate | Tests Passed | Total Tests |")
report.append("| --- | --- | --- | --- |")
for provider in [p for p in PROVIDER_ORDER if p in providers]: # Check against keys of passed-in dict
passed, total = provider_totals.get(provider, (0, 0))
pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
for provider in [p for p in providers if p not in PROVIDER_ORDER]: # Check against keys of passed-in dict
# Iterate through providers in the order they appear in the input dict
for provider in providers_sorted.keys():
passed, total = provider_totals.get(provider, (0, 0))
pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
report.append("\n")
for provider in sorted(
providers_sorted.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
):
for provider in providers_sorted.keys():
provider_models = providers_sorted[provider] # Use sorted models
if not provider_models:
continue
@@ -461,60 +437,62 @@ def main():
"--providers",
type=str,
nargs="+",
help="Specify providers to test (comma-separated or space-separated, default: all)",
help="Specify providers to include/test (comma-separated or space-separated, default: uses DEFAULT_PROVIDERS)",
)
parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)")
parser.add_argument("--k", type=str, help="Keyword expression to filter tests (passed to pytest -k)")
args = parser.parse_args()
all_results = {}
# Initialize collections to aggregate results in main
aggregated_providers = defaultdict(set)
final_providers_order = {} # Dictionary to store results, preserving processing order
aggregated_tests = set()
provider_timestamps = {}
if args.run_tests:
# Get list of available providers from command line or use detected providers
if args.providers:
# Handle both comma-separated and space-separated lists
test_providers = []
for provider_arg in args.providers:
# Split by comma if commas are present
if "," in provider_arg:
test_providers.extend(provider_arg.split(","))
else:
test_providers.append(provider_arg)
else:
# Default providers to test
test_providers = PROVIDER_ORDER
for provider in test_providers:
provider = provider.strip() # Remove any whitespace
result_file = run_tests(provider, keyword=args.k)
if result_file:
# Parse and aggregate results
parsed_results, providers_in_file, tests_in_file, run_timestamp = parse_results(result_file)
all_results.update(parsed_results)
for prov, models in providers_in_file.items():
aggregated_providers[prov].update(models)
if run_timestamp:
provider_timestamps[prov] = run_timestamp
aggregated_tests.update(tests_in_file)
# 1. Determine the desired list and order of providers
if args.providers:
desired_providers = []
for provider_arg in args.providers:
desired_providers.extend([p.strip() for p in provider_arg.split(",")])
else:
# Use existing results
provider_result_files = get_all_result_files_by_provider()
desired_providers = DEFAULT_PROVIDERS # Use default order/list
for result_file in provider_result_files.values():
# Parse and aggregate results
parsed_results, providers_in_file, tests_in_file, run_timestamp = parse_results(result_file)
all_results.update(parsed_results)
for prov, models in providers_in_file.items():
aggregated_providers[prov].update(models)
if run_timestamp:
provider_timestamps[prov] = run_timestamp
aggregated_tests.update(tests_in_file)
# 2. Run tests if requested (using the desired provider list)
if args.run_tests:
run_multiple_tests(desired_providers, args.k)
generate_report(all_results, aggregated_providers, aggregated_tests, provider_timestamps, args.output)
for provider in desired_providers:
# Construct the expected result file path directly
result_file = RESULTS_DIR / f"{provider}.json"
if result_file.exists(): # Check if the specific file exists
print(f"Loading results for {provider} from {result_file}")
try:
parsed_data = parse_results(result_file)
parsed_results, providers_in_file, tests_in_file, run_timestamp = parsed_data
all_results.update(parsed_results)
aggregated_tests.update(tests_in_file)
# Add models for this provider, ensuring it's added in the correct report order
if provider in providers_in_file:
if provider not in final_providers_order:
final_providers_order[provider] = set()
final_providers_order[provider].update(providers_in_file[provider])
if run_timestamp != "Unknown":
provider_timestamps[provider] = run_timestamp
else:
print(
f"Warning: Provider '{provider}' found in desired list but not within its result file data ({result_file})."
)
except Exception as e:
print(f"Error parsing results for provider {provider} from {result_file}: {e}")
else:
# Only print warning if we expected results (i.e., provider was in the desired list)
print(f"Result file for desired provider '{provider}' not found at {result_file}. Skipping.")
# 5. Generate the report using the filtered & ordered results
print(f"Final Provider Order for Report: {list(final_providers_order.keys())}")
generate_report(all_results, final_providers_order, aggregated_tests, provider_timestamps, args.output)
if __name__ == "__main__":

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

uv.lock (generated)

@@ -1,4 +1,5 @@
version = 1
revision = 1
requires-python = ">=3.10"
resolution-markers = [
"(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
@@ -1410,6 +1411,7 @@ dev = [
{ name = "pytest-asyncio" },
{ name = "pytest-cov" },
{ name = "pytest-html" },
{ name = "pytest-json-report" },
{ name = "ruamel-yaml" },
{ name = "ruff" },
{ name = "types-requests" },
@@ -1502,6 +1504,7 @@ requires-dist = [
{ name = "pytest-asyncio", marker = "extra == 'dev'" },
{ name = "pytest-cov", marker = "extra == 'dev'" },
{ name = "pytest-html", marker = "extra == 'dev'" },
{ name = "pytest-json-report", marker = "extra == 'dev'" },
{ name = "python-dotenv" },
{ name = "qdrant-client", marker = "extra == 'unit'" },
{ name = "requests" },
@@ -1531,6 +1534,7 @@ requires-dist = [
{ name = "types-setuptools", marker = "extra == 'dev'" },
{ name = "uvicorn", marker = "extra == 'dev'" },
]
provides-extras = ["dev", "unit", "test", "docs", "codegen", "ui"]
[[package]]
name = "llama-stack-client"
@@ -2740,6 +2744,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c8/c7/c160021cbecd956cc1a6f79e5fe155f7868b2e5b848f1320dad0b3e3122f/pytest_html-4.1.1-py3-none-any.whl", hash = "sha256:c8152cea03bd4e9bee6d525573b67bbc6622967b72b9628dda0ea3e2a0b5dd71", size = 23491 },
]
[[package]]
name = "pytest-json-report"
version = "1.5.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pytest" },
{ name = "pytest-metadata" },
]
sdist = { url = "https://files.pythonhosted.org/packages/4f/d3/765dae9712fcd68d820338908c1337e077d5fdadccd5cacf95b9b0bea278/pytest-json-report-1.5.0.tar.gz", hash = "sha256:2dde3c647851a19b5f3700729e8310a6e66efb2077d674f27ddea3d34dc615de", size = 21241 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/81/35/d07400c715bf8a88aa0c1ee9c9eb6050ca7fe5b39981f0eea773feeb0681/pytest_json_report-1.5.0-py3-none-any.whl", hash = "sha256:9897b68c910b12a2e48dd849f9a284b2c79a732a8a9cb398452ddd23d3c8c325", size = 13222 },
]
[[package]]
name = "pytest-metadata"
version = "3.1.1"