Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-27 18:50:41 +00:00)
feat(verification): various improvements (#1921)
# What does this PR do?

- Providers and their models now live in config.yaml.
- Better distinguish different cases within a test.
- Add a model key to surface the provider's model_id.
- Include an example command to rerun a single test case.

## Test Plan

<img width="1173" alt="image" src="https://github.com/user-attachments/assets/b414baf0-c768-451f-8c3b-c2905cf36fac" />
Parent: 09a83b1ec1
Commit: 14146e4b3f
22 changed files with 4449 additions and 8810 deletions
@@ -1,6 +1,6 @@
# Test Results Report

*Generated on: 2025-04-08 21:14:02*
*Generated on: 2025-04-09 22:52:19*

*This report was generated by running `python tests/verifications/generate_report.py`*

@@ -23,66 +23,107 @@
## Together

*Tests run on: 2025-04-08 16:19:59*
*Tests run on: 2025-04-09 22:50:58*

```bash
pytest tests/verifications/openai/test_chat_completion.py --provider=together -v
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -v

# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -k "test_chat_non_streaming_basic and earth"
```

| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct |

**Model Key (Together)**

| Display Name | Full Model ID |
| --- | --- |
| Llama-3.3-70B-Instruct | `meta-llama/Llama-3.3-70B-Instruct-Turbo` |
| Llama-4-Maverick-Instruct | `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` |
| Llama-4-Scout-Instruct | `meta-llama/Llama-4-Scout-17B-16E-Instruct` |

| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
| --- | --- | --- | --- |
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (case 0) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (case 1) | ✅ | ❌ | ❌ |
| test_chat_streaming_image (case 0) | ⚪ | ❌ | ❌ |
| test_chat_streaming_structured_output (case 0) | ✅ | ❌ | ❌ |
| test_chat_streaming_structured_output (case 1) | ✅ | ❌ | ❌ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (earth) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (saturn) | ✅ | ❌ | ❌ |
| test_chat_streaming_image | ⚪ | ❌ | ❌ |
| test_chat_streaming_structured_output (calendar) | ✅ | ❌ | ❌ |
| test_chat_streaming_structured_output (math) | ✅ | ❌ | ❌ |

## Fireworks

*Tests run on: 2025-04-08 16:18:28*
*Tests run on: 2025-04-09 22:50:02*

```bash
pytest tests/verifications/openai/test_chat_completion.py --provider=fireworks -v
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -v

# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -k "test_chat_non_streaming_basic and earth"
```

| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct |

**Model Key (Fireworks)**

| Display Name | Full Model ID |
| --- | --- |
| Llama-3.3-70B-Instruct | `accounts/fireworks/models/llama-v3p3-70b-instruct` |
| Llama-4-Maverick-Instruct | `accounts/fireworks/models/llama4-maverick-instruct-basic` |
| Llama-4-Scout-Instruct | `accounts/fireworks/models/llama4-scout-instruct-basic` |

| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
| --- | --- | --- | --- |
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ❌ | ❌ |
| test_chat_streaming_basic (case 0) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (case 1) | ✅ | ✅ | ✅ |
| test_chat_streaming_image (case 0) | ⚪ | ✅ | ✅ |
| test_chat_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
| test_chat_streaming_structured_output (case 1) | ❌ | ✅ | ✅ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ❌ | ❌ | ❌ |
| test_chat_streaming_basic (earth) | ✅ | ✅ | ✅ |
| test_chat_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
| test_chat_streaming_structured_output (math) | ✅ | ✅ | ✅ |

## Openai

*Tests run on: 2025-04-08 16:22:02*
*Tests run on: 2025-04-09 22:51:44*

```bash
pytest tests/verifications/openai/test_chat_completion.py --provider=openai -v
# Run all tests for this provider:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -v

# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -k "test_chat_non_streaming_basic and earth"
```

**Model Key (Openai)**

| Display Name | Full Model ID |
| --- | --- |
| gpt-4o | `gpt-4o` |
| gpt-4o-mini | `gpt-4o-mini` |

| Test | gpt-4o | gpt-4o-mini |
| --- | --- | --- |
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ |
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ |
| test_chat_non_streaming_image (case 0) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ |
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ |
| test_chat_streaming_basic (case 0) | ✅ | ✅ |
| test_chat_streaming_basic (case 1) | ✅ | ✅ |
| test_chat_streaming_image (case 0) | ✅ | ✅ |
| test_chat_streaming_structured_output (case 0) | ✅ | ✅ |
| test_chat_streaming_structured_output (case 1) | ✅ | ✅ |
| test_chat_non_streaming_basic (earth) | ✅ | ✅ |
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ |
| test_chat_non_streaming_image | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ |
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ |
| test_chat_non_streaming_tool_calling | ✅ | ✅ |
| test_chat_streaming_basic (earth) | ✅ | ✅ |
| test_chat_streaming_basic (saturn) | ✅ | ✅ |
| test_chat_streaming_image | ✅ | ✅ |
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ |
| test_chat_streaming_structured_output (math) | ✅ | ✅ |
tests/verifications/conf/cerebras.yaml (new file, 10 lines)
@@ -0,0 +1,10 @@
base_url: https://api.cerebras.ai/v1
api_key_var: CEREBRAS_API_KEY
models:
- llama-3.3-70b
model_display_names:
  llama-3.3-70b: Llama-3.3-70B-Instruct
test_exclusions:
  llama-3.3-70b:
  - test_chat_non_streaming_image
  - test_chat_streaming_image

tests/verifications/conf/fireworks.yaml (new file, 14 lines)
@@ -0,0 +1,14 @@
base_url: https://api.fireworks.ai/inference/v1
api_key_var: FIREWORKS_API_KEY
models:
- accounts/fireworks/models/llama-v3p3-70b-instruct
- accounts/fireworks/models/llama4-scout-instruct-basic
- accounts/fireworks/models/llama4-maverick-instruct-basic
model_display_names:
  accounts/fireworks/models/llama-v3p3-70b-instruct: Llama-3.3-70B-Instruct
  accounts/fireworks/models/llama4-scout-instruct-basic: Llama-4-Scout-Instruct
  accounts/fireworks/models/llama4-maverick-instruct-basic: Llama-4-Maverick-Instruct
test_exclusions:
  accounts/fireworks/models/llama-v3p3-70b-instruct:
  - test_chat_non_streaming_image
  - test_chat_streaming_image

tests/verifications/conf/groq.yaml (new file, 14 lines)
@@ -0,0 +1,14 @@
base_url: https://api.groq.com/openai/v1
api_key_var: GROQ_API_KEY
models:
- llama-3.3-70b-versatile
- llama-4-scout-17b-16e-instruct
- llama-4-maverick-17b-128e-instruct
model_display_names:
  llama-3.3-70b-versatile: Llama-3.3-70B-Instruct
  llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct
  llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct
test_exclusions:
  llama-3.3-70b-versatile:
  - test_chat_non_streaming_image
  - test_chat_streaming_image

tests/verifications/conf/openai.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
base_url: https://api.openai.com/v1
api_key_var: OPENAI_API_KEY
models:
- gpt-4o
- gpt-4o-mini
model_display_names:
  gpt-4o: gpt-4o
  gpt-4o-mini: gpt-4o-mini
test_exclusions: {}

tests/verifications/conf/together.yaml (new file, 14 lines)
@@ -0,0 +1,14 @@
base_url: https://api.together.xyz/v1
api_key_var: TOGETHER_API_KEY
models:
- meta-llama/Llama-3.3-70B-Instruct-Turbo
- meta-llama/Llama-4-Scout-17B-16E-Instruct
- meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
model_display_names:
  meta-llama/Llama-3.3-70B-Instruct-Turbo: Llama-3.3-70B-Instruct
  meta-llama/Llama-4-Scout-17B-16E-Instruct: Llama-4-Scout-Instruct
  meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8: Llama-4-Maverick-Instruct
test_exclusions:
  meta-llama/Llama-3.3-70B-Instruct-Turbo:
  - test_chat_non_streaming_image
  - test_chat_streaming_image
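These exclusions are consistent with the ⚪ (skipped) cells for the Llama-3.3-70B-Instruct image tests in the report above. As a rough sketch of how an exclusion lookup could work (a hypothetical helper, not code from this PR; the actual wiring lives in the fixtures and tests later in this diff), using the field names from the conf files:

```python
# Hypothetical helper: check whether a test is excluded for a given model.
def is_excluded(provider_conf: dict, model: str, test_name: str) -> bool:
    return test_name in provider_conf.get("test_exclusions", {}).get(model, [])


together_conf = {
    "test_exclusions": {
        "meta-llama/Llama-3.3-70B-Instruct-Turbo": [
            "test_chat_non_streaming_image",
            "test_chat_streaming_image",
        ]
    }
}
assert is_excluded(
    together_conf, "meta-llama/Llama-3.3-70B-Instruct-Turbo", "test_chat_streaming_image"
)
```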
@@ -4,6 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import re

import pytest


def pytest_addoption(parser):
    parser.addoption(
@@ -14,7 +18,7 @@ def pytest_addoption(parser):
    parser.addoption(
        "--api-key",
        action="store",
        help="API key",
        help="API key to use for the provider",
    )
    parser.addoption(
        "--provider",
@@ -24,5 +28,64 @@ def pytest_addoption(parser):

pytest_plugins = [
    "tests.verifications.openai.fixtures.fixtures",
    "pytest_jsonreport",
    "tests.verifications.openai_api.fixtures.fixtures",
    "tests.verifications.openai_api.fixtures.load",
]


@pytest.hookimpl(optionalhook=True)
def pytest_json_runtest_metadata(item, call):
    """Add model and case_id to pytest-json report metadata."""
    metadata = {}
    nodeid = item.nodeid

    # 1. Extract model from callspec if available
    model = item.callspec.params.get("model") if hasattr(item, "callspec") else None
    if model:
        metadata["model"] = model
    else:
        # Fallback: Try parsing from nodeid (less reliable)
        match_model = re.search(r"\[(.*?)-", nodeid)
        if match_model:
            model = match_model.group(1)  # Store model even if found via fallback
            metadata["model"] = model
        else:
            print(f"Warning: Could not determine model for test {nodeid}")
            model = None  # Ensure model is None if not found

    # 2. Extract case_id using the known model string if possible
    if model:
        # Construct a regex pattern to find the case_id *after* the model name and a hyphen.
        # Escape the model name in case it contains regex special characters.
        pattern = re.escape(model) + r"-(.*?)\]$"
        match_case = re.search(pattern, nodeid)
        if match_case:
            case_id = match_case.group(1)
            metadata["case_id"] = case_id
        else:
            # Fallback if the pattern didn't match (e.g., nodeid format unexpected)
            # Try the old less specific regex as a last resort.
            match_case_fallback = re.search(r"-(.*?)\]$", nodeid)
            if match_case_fallback:
                case_id = match_case_fallback.group(1)
                metadata["case_id"] = case_id
                print(f"Warning: Used fallback regex to parse case_id from nodeid {nodeid}")
            else:
                print(f"Warning: Could not parse case_id from nodeid {nodeid} even with fallback.")
                if "case" in (item.callspec.params if hasattr(item, "callspec") else {}):
                    metadata["case_id"] = "parsing_failed"
    elif "case" in (item.callspec.params if hasattr(item, "callspec") else {}):
        # Cannot reliably parse case_id without model, but we know it's a case test.
        # Try the generic fallback regex.
        match_case_fallback = re.search(r"-(.*?)\]$", nodeid)
        if match_case_fallback:
            case_id = match_case_fallback.group(1)
            metadata["case_id"] = case_id
            print(f"Warning: Used fallback regex to parse case_id from nodeid {nodeid} (model unknown)")
        else:
            print(f"Warning: Could not parse case_id from nodeid {nodeid} (model unknown)")
            metadata["case_id"] = "parsing_failed_no_model"
    # else: Not a test with a model or case param we need to handle.

    return metadata
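As an aside (not part of the diff), here is what the metadata hook above produces for a hypothetical parametrized node ID; the model normally comes from `item.callspec.params`, and the case ID is recovered from the bracketed suffix:

```python
import re

# Hypothetical node ID for a parametrized test; the exact parametrization of the new
# openai_api test module is not shown in this diff, so treat the names as illustrative.
nodeid = (
    "tests/verifications/openai_api/test_chat_completion.py::"
    "test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]"
)
model = "meta-llama/Llama-4-Scout-17B-16E-Instruct"  # normally taken from item.callspec.params

# Same pattern the hook builds: escaped model name, a hyphen, then the case id at the end.
match = re.search(re.escape(model) + r"-(.*?)\]$", nodeid)
assert match is not None and match.group(1) == "earth"

# The hook would therefore attach metadata like:
# {"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", "case_id": "earth"}
```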
@ -4,27 +4,48 @@
|
|||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
# /// script
|
||||
# requires-python = ">=3.10"
|
||||
# dependencies = [
|
||||
# "pytest-json-report",
|
||||
# "pyyaml",
|
||||
# ]
|
||||
# ///
|
||||
"""
|
||||
Test Report Generator
|
||||
|
||||
Requirements:
|
||||
pip install pytest-json-report
|
||||
Description:
|
||||
This script runs pytest tests (specifically designed for OpenAI API compatibility checks)
|
||||
for different providers, aggregates the results from JSON reports, and generates
|
||||
a markdown summary report (REPORT.md).
|
||||
|
||||
It automatically cleans up old test result files, keeping only the latest
|
||||
per provider.
|
||||
|
||||
|
||||
Configuration:
|
||||
- Provider details (models, display names) are loaded from `tests/verifications/config.yaml`.
|
||||
- Test cases are defined in YAML files within `tests/verifications/openai_api/fixtures/test_cases/`.
|
||||
- Test results are stored in `tests/verifications/test_results/`.
|
||||
|
||||
Usage:
|
||||
# Generate a report using existing test results
|
||||
# Generate a report using the latest existing test results
|
||||
python tests/verifications/generate_report.py
|
||||
|
||||
# Run tests and generate a report
|
||||
# Run tests for all configured providers and generate a report
|
||||
python tests/verifications/generate_report.py --run-tests
|
||||
|
||||
# Run tests for specific providers
|
||||
# Run tests only for specific providers (space-separated)
|
||||
python tests/verifications/generate_report.py --run-tests --providers fireworks openai
|
||||
|
||||
# Run tests matching a keyword expression (uses pytest -k)
|
||||
python tests/verifications/generate_report.py --run-tests --providers fireworks --k "streaming"
|
||||
|
||||
# Run a specific test case for a provider
|
||||
python tests/verifications/generate_report.py --run-tests --providers fireworks --k "test_chat_streaming_basic and basic_earth"
|
||||
|
||||
# Save the report to a custom location
|
||||
python tests/verifications/generate_report.py --output custom_report.md
|
||||
|
||||
# Clean up old test result files
|
||||
python tests/verifications/generate_report.py --cleanup
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
@ -35,6 +56,9 @@ import subprocess
|
|||
import time
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, DefaultDict, Dict, Set, Tuple
|
||||
|
||||
from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
|
||||
|
||||
# Define the root directory for test results
|
||||
RESULTS_DIR = Path(__file__).parent / "test_results"
|
||||
|
@ -43,17 +67,12 @@ RESULTS_DIR.mkdir(exist_ok=True)
|
|||
# Maximum number of test result files to keep per provider
|
||||
MAX_RESULTS_PER_PROVIDER = 1
|
||||
|
||||
# Custom order of providers
|
||||
PROVIDER_ORDER = ["together", "fireworks", "groq", "cerebras", "openai"]
|
||||
|
||||
# Dictionary to store providers and their models (will be populated dynamically)
|
||||
PROVIDERS = defaultdict(set)
|
||||
|
||||
# Tests will be dynamically extracted from results
|
||||
ALL_TESTS = set()
|
||||
VERIFICATION_CONFIG = _load_all_verification_configs()
|
||||
|
||||
|
||||
def run_tests(provider):
|
||||
def run_tests(provider, keyword=None):
|
||||
"""Run pytest for a specific provider and save results"""
|
||||
print(f"Running tests for provider: {provider}")
|
||||
|
||||
|
@ -61,20 +80,28 @@ def run_tests(provider):
|
|||
result_file = RESULTS_DIR / f"{provider}_{timestamp}.json"
|
||||
temp_json_file = RESULTS_DIR / f"temp_{provider}_{timestamp}.json"
|
||||
|
||||
# Determine project root directory relative to this script
|
||||
project_root = Path(__file__).parent.parent.parent
|
||||
|
||||
# Run pytest with JSON output
|
||||
cmd = [
|
||||
"python",
|
||||
"-m",
|
||||
"pytest",
|
||||
"tests/verifications/openai/test_chat_completion.py",
|
||||
"tests/verifications/openai_api/test_chat_completion.py",
|
||||
f"--provider={provider}",
|
||||
"-v",
|
||||
"--json-report",
|
||||
f"--json-report-file={temp_json_file}",
|
||||
]
|
||||
|
||||
# Append -k argument if provided
|
||||
if keyword:
|
||||
cmd.extend(["-k", keyword])
|
||||
|
||||
try:
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
# Run subprocess with cwd set to project root
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, cwd=project_root)
|
||||
print(f"Pytest exit code: {result.returncode}")
|
||||
|
||||
# Check if the JSON file was created
|
||||
|
@ -103,18 +130,30 @@ def run_tests(provider):
|
|||
return None
|
||||
|
||||
|
||||
def parse_results(result_file):
|
||||
"""Parse the test results file and extract pass/fail by model and test"""
|
||||
def parse_results(
|
||||
result_file,
|
||||
) -> Tuple[DefaultDict[str, DefaultDict[str, Dict[str, bool]]], DefaultDict[str, Set[str]], Set[str]]:
|
||||
"""Parse a single test results file.
|
||||
|
||||
Returns:
|
||||
Tuple containing:
|
||||
- parsed_results: DefaultDict[provider, DefaultDict[model, Dict[test_name, pass_status]]]
|
||||
- providers_in_file: DefaultDict[provider, Set[model]] found in this file.
|
||||
- tests_in_file: Set[test_name] found in this file.
|
||||
"""
|
||||
if not os.path.exists(result_file):
|
||||
print(f"Results file does not exist: {result_file}")
|
||||
return {}
|
||||
# Return empty defaultdicts/set matching the type hint
|
||||
return defaultdict(lambda: defaultdict(dict)), defaultdict(set), set()
|
||||
|
||||
with open(result_file, "r") as f:
|
||||
results = json.load(f)
|
||||
|
||||
# Initialize results dictionary
|
||||
parsed_results = defaultdict(lambda: defaultdict(dict))
|
||||
provider = os.path.basename(result_file).split("_")[0]
|
||||
# Initialize results dictionary with specific types
|
||||
parsed_results: DefaultDict[str, DefaultDict[str, Dict[str, bool]]] = defaultdict(lambda: defaultdict(dict))
|
||||
providers_in_file: DefaultDict[str, Set[str]] = defaultdict(set)
|
||||
tests_in_file: Set[str] = set()
|
||||
provider: str = os.path.basename(result_file).split("_")[0]
|
||||
|
||||
# Debug: Print summary of test results
|
||||
print(f"Test results summary for {provider}:")
|
||||
|
@ -127,124 +166,72 @@ def parse_results(result_file):
|
|||
# Extract test results
|
||||
if "tests" not in results or not results["tests"]:
|
||||
print(f"No test results found in {result_file}")
|
||||
return parsed_results
|
||||
# Return empty defaultdicts/set matching the type hint
|
||||
return defaultdict(lambda: defaultdict(dict)), defaultdict(set), set()
|
||||
|
||||
# Map for normalizing model names
|
||||
model_name_map = {
|
||||
"Llama-3.3-8B-Instruct": "Llama-3.3-8B-Instruct",
|
||||
"Llama-3.3-70B-Instruct": "Llama-3.3-70B-Instruct",
|
||||
"Llama-3.2-11B-Vision-Instruct": "Llama-3.2-11B-Vision-Instruct",
|
||||
"Llama-4-Scout-17B-16E": "Llama-4-Scout-17B-16E-Instruct",
|
||||
"Llama-4-Scout-17B-16E-Instruct": "Llama-4-Scout-17B-16E-Instruct",
|
||||
"Llama-4-Maverick-17B-128E": "Llama-4-Maverick-17B-128E-Instruct",
|
||||
"Llama-4-Maverick-17B-128E-Instruct": "Llama-4-Maverick-17B-128E-Instruct",
|
||||
"gpt-4o": "gpt-4o",
|
||||
"gpt-4o-mini": "gpt-4o-mini",
|
||||
}
|
||||
|
||||
# Keep track of all models found for this provider
|
||||
provider_models = set()
|
||||
|
||||
# Track all unique test cases for each base test
|
||||
test_case_counts = defaultdict(int)
|
||||
|
||||
# First pass: count the number of cases for each test
|
||||
# Process the tests
|
||||
for test in results["tests"]:
|
||||
test_id = test.get("nodeid", "")
|
||||
|
||||
if "call" in test:
|
||||
test_name = test_id.split("::")[1].split("[")[0]
|
||||
input_output_match = re.search(r"\[input_output(\d+)-", test_id)
|
||||
if input_output_match:
|
||||
test_case_counts[test_name] += 1
|
||||
if not (call_phase := test.get("call")):
|
||||
continue
|
||||
call_outcome = call_phase.get("outcome")
|
||||
if call_outcome not in ("passed", "failed"):
|
||||
continue
|
||||
|
||||
# Second pass: process the tests with case numbers only for tests with multiple cases
|
||||
for test in results["tests"]:
|
||||
test_id = test.get("nodeid", "")
|
||||
outcome = test.get("outcome", "")
|
||||
# --- Extract data from metadata ---
|
||||
metadata = test.get("metadata", {})
|
||||
model = metadata.get("model")
|
||||
case_id = metadata.get("case_id") # String ID (if provided)
|
||||
case_index = metadata.get("case_index") # Integer index (if no ID provided)
|
||||
|
||||
# Only process tests that have been executed (not setup errors)
|
||||
if "call" in test:
|
||||
# Regular test that actually ran
|
||||
test_name = test_id.split("::")[1].split("[")[0]
|
||||
# Check if we have a model and at least one case identifier
|
||||
if not model or (case_id is None and case_index is None):
|
||||
print(
|
||||
f"Warning: Missing 'model' or case identifier ('case_id'/'case_index') metadata for test: {test_id}. Skipping."
|
||||
)
|
||||
continue
|
||||
|
||||
# Extract input_output parameter to differentiate between test cases
|
||||
input_output_match = re.search(r"\[input_output(\d+)-", test_id)
|
||||
input_output_index = input_output_match.group(1) if input_output_match else ""
|
||||
try:
|
||||
test_name_base = test_id.split("::")[1].split("[")[0]
|
||||
except (IndexError, ValueError) as e:
|
||||
print(f"Warning: Could not parse base test name for {test_id}. Error: {e}. Skipping.")
|
||||
continue
|
||||
|
||||
# Create a more detailed test name with case number only if there are multiple cases
|
||||
detailed_test_name = test_name
|
||||
if input_output_index and test_case_counts[test_name] > 1:
|
||||
detailed_test_name = f"{test_name} (case {input_output_index})"
|
||||
# Construct detailed test name using ID or index
|
||||
if case_id is not None:
|
||||
detailed_test_name = f"{test_name_base} ({case_id})"
|
||||
elif case_index == 0:
|
||||
# If case_id is missing and index is 0, assume single case, use base name only
|
||||
detailed_test_name = test_name_base
|
||||
elif case_index is not None: # case_index > 0
|
||||
# Use case_index for naming if case_id wasn't provided and index > 0
|
||||
detailed_test_name = f"{test_name_base} (case{case_index})"
|
||||
else:
|
||||
# This case should be prevented by the earlier check, but handle defensively
|
||||
print(f"Error: No case identifier found for test {test_id} after initial check. Skipping.")
|
||||
continue
|
||||
|
||||
# Track all unique test names
|
||||
ALL_TESTS.add(detailed_test_name)
|
||||
# Populate collections for this file
|
||||
tests_in_file.add(detailed_test_name)
|
||||
providers_in_file[provider].add(model)
|
||||
|
||||
# Extract model name from test_id using a more robust pattern
|
||||
model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id)
|
||||
if model_match:
|
||||
raw_model = model_match.group(1)
|
||||
model = model_name_map.get(raw_model, raw_model)
|
||||
if call_outcome == "passed":
|
||||
parsed_results[provider][model][detailed_test_name] = True
|
||||
elif call_outcome == "failed":
|
||||
parsed_results[provider][model][detailed_test_name] = False
|
||||
|
||||
# Add to set of known models for this provider
|
||||
provider_models.add(model)
|
||||
# Final Summary Warning (Optional)
|
||||
if not parsed_results.get(provider):
|
||||
print(f"Warning: No valid test results parsed for provider {provider} from file {result_file}")
|
||||
|
||||
# Also update the global PROVIDERS dictionary
|
||||
PROVIDERS[provider].add(model)
|
||||
|
||||
# Store the result
|
||||
if outcome == "passed":
|
||||
parsed_results[provider][model][detailed_test_name] = True
|
||||
else:
|
||||
parsed_results[provider][model][detailed_test_name] = False
|
||||
|
||||
print(f"Parsed test result: {detailed_test_name} for model {model}: {outcome}")
|
||||
elif outcome == "error" and "setup" in test and test.get("setup", {}).get("outcome") == "failed":
|
||||
# This is a setup failure, which likely means a configuration issue
|
||||
# Extract the base test name and model name
|
||||
parts = test_id.split("::")
|
||||
if len(parts) > 1:
|
||||
test_name = parts[1].split("[")[0]
|
||||
|
||||
# Extract input_output parameter to differentiate between test cases
|
||||
input_output_match = re.search(r"\[input_output(\d+)-", test_id)
|
||||
input_output_index = input_output_match.group(1) if input_output_match else ""
|
||||
|
||||
# Create a more detailed test name with case number only if there are multiple cases
|
||||
detailed_test_name = test_name
|
||||
if input_output_index and test_case_counts[test_name] > 1:
|
||||
detailed_test_name = f"{test_name} (case {input_output_index})"
|
||||
|
||||
if detailed_test_name in ALL_TESTS:
|
||||
# Use a more robust pattern for model extraction
|
||||
model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id)
|
||||
if model_match:
|
||||
raw_model = model_match.group(1)
|
||||
model = model_name_map.get(raw_model, raw_model)
|
||||
|
||||
# Add to set of known models for this provider
|
||||
provider_models.add(model)
|
||||
|
||||
# Also update the global PROVIDERS dictionary
|
||||
PROVIDERS[provider].add(model)
|
||||
|
||||
# Mark setup failures as false (failed)
|
||||
parsed_results[provider][model][detailed_test_name] = False
|
||||
print(f"Parsed setup failure: {detailed_test_name} for model {model}")
|
||||
|
||||
# Debug: Print parsed results
|
||||
if not parsed_results[provider]:
|
||||
print(f"Warning: No test results parsed for provider {provider}")
|
||||
else:
|
||||
for model, tests in parsed_results[provider].items():
|
||||
print(f"Model {model}: {len(tests)} test results")
|
||||
|
||||
return parsed_results
|
||||
return parsed_results, providers_in_file, tests_in_file
|
||||
|
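To make the new return shape concrete, here is an illustrative (not taken from a real run) example of the three values for a single Together results file, matching the types in the docstring above:

```python
from collections import defaultdict

# parsed_results: provider -> model -> detailed test name -> pass/fail
parsed_results = defaultdict(lambda: defaultdict(dict))
parsed_results["together"]["meta-llama/Llama-3.3-70B-Instruct-Turbo"] = {
    "test_chat_non_streaming_basic (earth)": True,             # call outcome was "passed"
    "test_chat_streaming_structured_output (math)": False,     # call outcome was "failed"
}

# providers_in_file: provider -> set of models seen in this results file
providers_in_file = defaultdict(set)
providers_in_file["together"].add("meta-llama/Llama-3.3-70B-Instruct-Turbo")

# tests_in_file: all detailed test names seen in this results file
tests_in_file = {
    "test_chat_non_streaming_basic (earth)",
    "test_chat_streaming_structured_output (math)",
}
```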
||||
|
||||
def cleanup_old_results():
|
||||
"""Clean up old test result files, keeping only the newest N per provider"""
|
||||
for provider in PROVIDERS.keys():
|
||||
def cleanup_old_results(providers_to_clean: Dict[str, Set[str]]):
|
||||
"""Clean up old test result files, keeping only the newest N per provider."""
|
||||
# Use the passed-in providers dictionary
|
||||
for provider in providers_to_clean.keys():
|
||||
# Get all result files for this provider
|
||||
provider_files = list(RESULTS_DIR.glob(f"{provider}_*.json"))
|
||||
|
||||
|
@ -289,8 +276,17 @@ def get_latest_results_by_provider():
|
|||
return provider_results
|
||||
|
||||
|
||||
def generate_report(results_dict, output_file=None):
|
||||
"""Generate the markdown report"""
|
||||
def generate_report(
|
||||
results_dict: Dict[str, Any], providers: Dict[str, Set[str]], all_tests: Set[str], output_file=None
|
||||
):
|
||||
"""Generate the markdown report.
|
||||
|
||||
Args:
|
||||
results_dict: Aggregated results [provider][model][test_name] -> status.
|
||||
providers: Dict of all providers and their models {provider: {models}}.
|
||||
all_tests: Set of all test names found.
|
||||
output_file: Optional path to save the report.
|
||||
"""
|
||||
if output_file is None:
|
||||
# Default to creating the report in the same directory as this script
|
||||
output_file = Path(__file__).parent / "REPORT.md"
|
||||
|
@ -299,8 +295,8 @@ def generate_report(results_dict, output_file=None):
|
|||
|
||||
# Get the timestamp from result files
|
||||
provider_timestamps = {}
|
||||
provider_results = get_latest_results_by_provider()
|
||||
for provider, result_file in provider_results.items():
|
||||
provider_results_files = get_latest_results_by_provider()
|
||||
for provider, result_file in provider_results_files.items():
|
||||
# Extract timestamp from filename (format: provider_timestamp.json)
|
||||
try:
|
||||
timestamp_str = result_file.stem.split("_")[1]
|
||||
|
@ -310,12 +306,33 @@ def generate_report(results_dict, output_file=None):
|
|||
except (IndexError, ValueError):
|
||||
provider_timestamps[provider] = "Unknown"
|
||||
|
||||
# Convert provider model sets to sorted lists
|
||||
for provider in PROVIDERS:
|
||||
PROVIDERS[provider] = sorted(PROVIDERS[provider])
|
||||
# Convert provider model sets to sorted lists (use passed-in providers dict)
|
||||
providers_sorted = {prov: sorted(models) for prov, models in providers.items()}
|
||||
|
||||
# Sort tests alphabetically
|
||||
sorted_tests = sorted(ALL_TESTS)
|
||||
# Sort tests alphabetically (use passed-in all_tests set)
|
||||
sorted_tests = sorted(all_tests)
|
||||
|
||||
# Calculate counts for each base test name
|
||||
base_test_case_counts: DefaultDict[str, int] = defaultdict(int)
|
||||
base_test_name_map: Dict[str, str] = {}
|
||||
for test_name in sorted_tests:
|
||||
match = re.match(r"^(.*?)( \([^)]+\))?$", test_name)
|
||||
if match:
|
||||
base_name = match.group(1).strip()
|
||||
base_test_case_counts[base_name] += 1
|
||||
base_test_name_map[test_name] = base_name
|
||||
else:
|
||||
# Should not happen with current naming, but handle defensively
|
||||
base_test_case_counts[test_name] += 1
|
||||
base_test_name_map[test_name] = test_name
|
||||
|
||||
if not sorted_tests:
|
||||
print("Warning: No test results found to generate a report.")
|
||||
# Optionally create an empty report or return early
|
||||
with open(output_file, "w") as f:
|
||||
f.write("# Test Results Report\n\nNo test results found.\n")
|
||||
print(f"Generated empty report: {output_file}")
|
||||
return
|
||||
|
||||
report = ["# Test Results Report\n"]
|
||||
report.append(f"*Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n")
|
||||
|
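For reference (not part of the diff), the base-name pattern used above separates a detailed test name into its base test and an optional case suffix:

```python
import re

# Optional " (case_id)" suffix after the base test name.
pattern = r"^(.*?)( \([^)]+\))?$"

m = re.match(pattern, "test_chat_streaming_basic (earth)")
assert m.group(1).strip() == "test_chat_streaming_basic" and m.group(2) == " (earth)"

m = re.match(pattern, "test_chat_non_streaming_image")  # single-case test, no suffix
assert m.group(1).strip() == "test_chat_non_streaming_image" and m.group(2) is None
```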
@ -336,19 +353,15 @@ def generate_report(results_dict, output_file=None):
|
|||
# Add a summary section
|
||||
report.append("## Summary\n")
|
||||
|
||||
# Count total tests and passes
|
||||
# Count total tests and passes (use passed-in providers and all_tests)
|
||||
total_tests = 0
|
||||
passed_tests = 0
|
||||
provider_totals = {}
|
||||
|
||||
# Prepare summary data
|
||||
for provider in PROVIDERS.keys():
|
||||
for provider, models in providers_sorted.items():
|
||||
provider_passed = 0
|
||||
provider_total = 0
|
||||
|
||||
if provider in results_dict:
|
||||
provider_models = PROVIDERS[provider]
|
||||
for model in provider_models:
|
||||
for model in models:
|
||||
if model in results_dict[provider]:
|
||||
model_results = results_dict[provider][model]
|
||||
for test in sorted_tests:
|
||||
|
@ -358,33 +371,26 @@ def generate_report(results_dict, output_file=None):
|
|||
if model_results[test]:
|
||||
provider_passed += 1
|
||||
passed_tests += 1
|
||||
|
||||
provider_totals[provider] = (provider_passed, provider_total)
|
||||
|
||||
# Add summary table
|
||||
# Add summary table (use passed-in providers dict)
|
||||
report.append("| Provider | Pass Rate | Tests Passed | Total Tests |")
|
||||
report.append("| --- | --- | --- | --- |")
|
||||
|
||||
# Use the custom order for summary table
|
||||
for provider in [p for p in PROVIDER_ORDER if p in PROVIDERS]:
|
||||
for provider in [p for p in PROVIDER_ORDER if p in providers]: # Check against keys of passed-in dict
|
||||
passed, total = provider_totals.get(provider, (0, 0))
|
||||
pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
|
||||
report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
|
||||
|
||||
# Add providers not in the custom order
|
||||
for provider in [p for p in PROVIDERS if p not in PROVIDER_ORDER]:
|
||||
for provider in [p for p in providers if p not in PROVIDER_ORDER]: # Check against keys of passed-in dict
|
||||
passed, total = provider_totals.get(provider, (0, 0))
|
||||
pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
|
||||
report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
|
||||
|
||||
report.append("\n")
|
||||
|
||||
# Process each provider in the custom order, then any additional providers
|
||||
for provider in sorted(
|
||||
PROVIDERS.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
|
||||
providers_sorted.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
|
||||
):
|
||||
if not PROVIDERS[provider]:
|
||||
# Skip providers with no models
|
||||
provider_models = providers_sorted[provider] # Use sorted models
|
||||
if not provider_models:
|
||||
continue
|
||||
|
||||
report.append(f"\n## {provider.capitalize()}\n")
|
||||
|
@ -394,34 +400,70 @@ def generate_report(results_dict, output_file=None):
|
|||
report.append(f"*Tests run on: {provider_timestamps[provider]}*\n")
|
||||
|
||||
# Add test command for reproducing results
|
||||
test_cmd = f"pytest tests/verifications/openai/test_chat_completion.py --provider={provider} -v"
|
||||
report.append(f"```bash\n{test_cmd}\n```\n")
|
||||
test_cmd_all = f"pytest tests/verifications/openai_api/test_chat_completion.py --provider={provider} -v"
|
||||
report.append(f"```bash\n# Run all tests for this provider:\n{test_cmd_all}\n")
|
||||
|
||||
# Get the relevant models for this provider
|
||||
provider_models = PROVIDERS[provider]
|
||||
# Find an example test with a case ID
|
||||
example_base_test_name = None
|
||||
example_case_id = None
|
||||
# Get first test as fallback base, handle empty list
|
||||
first_test_name = sorted_tests[0] if sorted_tests else "unknown_test"
|
||||
|
||||
# Create table header with models as columns
|
||||
header = "| Test | " + " | ".join(provider_models) + " |"
|
||||
match = re.match(r"^(.*?) \((.*?)\)$", first_test_name)
|
||||
if match:
|
||||
example_base_test_name = match.group(1).strip()
|
||||
example_case_id = match.group(2).strip()
|
||||
else:
|
||||
example_base_test_name = first_test_name
|
||||
|
||||
base_name = base_test_name_map.get(test, test) # Get base name
|
||||
case_count = base_test_case_counts.get(base_name, 1) # Get count
|
||||
filter_str = f"{example_base_test_name} and {example_case_id}" if case_count > 1 else example_base_test_name
|
||||
|
||||
test_cmd_specific_case = (
|
||||
f'pytest tests/verifications/openai_api/test_chat_completion.py --provider={provider} -k "{filter_str}"'
|
||||
)
|
||||
report.append(
|
||||
f"# Example: Run only the '{example_case_id}' case of {example_base_test_name}:\n{test_cmd_specific_case}\n```\n"
|
||||
)
|
||||
|
||||
# Get display names (use passed-in providers dict)
|
||||
provider_config = VERIFICATION_CONFIG.get("providers", {}).get(provider, {})
|
||||
display_name_map = provider_config.get("model_display_names", {})
|
||||
|
||||
# Add Model Key Table (use provider_models)
|
||||
report.append(f"\n**Model Key ({provider.capitalize()})**\n")
|
||||
provider_key_lines = ["| Display Name | Full Model ID |", "| --- | --- |"]
|
||||
for model_id in provider_models:
|
||||
display_name = display_name_map.get(model_id, model_id)
|
||||
provider_key_lines.append(f"| {display_name} | `{model_id}` |")
|
||||
report.extend(provider_key_lines)
|
||||
report.append("\n")
|
||||
|
||||
# Create results table header (use provider_models)
|
||||
display_names = [display_name_map.get(m, m) for m in provider_models]
|
||||
header = "| Test | " + " | ".join(display_names) + " |"
|
||||
separator = "| --- | " + " | ".join(["---"] * len(provider_models)) + " |"
|
||||
|
||||
report.append(header)
|
||||
report.append(separator)
|
||||
|
||||
# Get results for this provider
|
||||
provider_results = results_dict.get(provider, {})
|
||||
# Get results for this provider from results_dict
|
||||
provider_results_data = results_dict.get(provider, {})
|
||||
|
||||
# Add rows for each test
|
||||
# Add rows for each test (use sorted_tests)
|
||||
for test in sorted_tests:
|
||||
row = f"| {test} |"
|
||||
# Determine display name based on case count
|
||||
base_name = base_test_name_map.get(test, test) # Get base name
|
||||
case_count = base_test_case_counts.get(base_name, 1) # Get count
|
||||
display_test_name = base_name if case_count == 1 else test # Choose display name
|
||||
row = f"| {display_test_name} |" # Use display name
|
||||
|
||||
# Add results for each model in this test
|
||||
for model in provider_models:
|
||||
if model in provider_results and test in provider_results[model]:
|
||||
result = pass_icon if provider_results[model][test] else fail_icon
|
||||
for model_id in provider_models:
|
||||
if model_id in provider_results_data and test in provider_results_data[model_id]:
|
||||
result = pass_icon if provider_results_data[model_id][test] else fail_icon
|
||||
else:
|
||||
result = na_icon
|
||||
row += f" {result} |"
|
||||
|
||||
report.append(row)
|
||||
|
||||
# Write to file
|
||||
|
@ -442,9 +484,13 @@ def main():
|
|||
help="Specify providers to test (comma-separated or space-separated, default: all)",
|
||||
)
|
||||
parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)")
|
||||
parser.add_argument("--k", type=str, help="Keyword expression to filter tests (passed to pytest -k)")
|
||||
args = parser.parse_args()
|
||||
|
||||
all_results = {}
|
||||
# Initialize collections to aggregate results in main
|
||||
aggregated_providers = defaultdict(set)
|
||||
aggregated_tests = set()
|
||||
|
||||
if args.run_tests:
|
||||
# Get list of available providers from command line or use detected providers
|
||||
|
@ -463,22 +509,31 @@ def main():
|
|||
|
||||
for provider in test_providers:
|
||||
provider = provider.strip() # Remove any whitespace
|
||||
result_file = run_tests(provider)
|
||||
result_file = run_tests(provider, keyword=args.k)
|
||||
if result_file:
|
||||
provider_results = parse_results(result_file)
|
||||
all_results.update(provider_results)
|
||||
# Parse and aggregate results
|
||||
parsed_results, providers_in_file, tests_in_file = parse_results(result_file)
|
||||
all_results.update(parsed_results)
|
||||
for prov, models in providers_in_file.items():
|
||||
aggregated_providers[prov].update(models)
|
||||
aggregated_tests.update(tests_in_file)
|
||||
else:
|
||||
# Use existing results
|
||||
provider_result_files = get_latest_results_by_provider()
|
||||
|
||||
for result_file in provider_result_files.values():
|
||||
provider_results = parse_results(result_file)
|
||||
all_results.update(provider_results)
|
||||
# Parse and aggregate results
|
||||
parsed_results, providers_in_file, tests_in_file = parse_results(result_file)
|
||||
all_results.update(parsed_results)
|
||||
for prov, models in providers_in_file.items():
|
||||
aggregated_providers[prov].update(models)
|
||||
aggregated_tests.update(tests_in_file)
|
||||
|
||||
# Generate the report
|
||||
generate_report(all_results, args.output)
|
||||
# Generate the report, passing aggregated data
|
||||
generate_report(all_results, aggregated_providers, aggregated_tests, args.output)
|
||||
|
||||
cleanup_old_results()
|
||||
# Cleanup, passing aggregated providers
|
||||
cleanup_old_results(aggregated_providers)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -1,97 +0,0 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def providers_model_mapping():
|
||||
"""
|
||||
Mapping from model names used in test cases to provider's model names.
|
||||
"""
|
||||
return {
|
||||
"fireworks": {
|
||||
"Llama-3.3-70B-Instruct": "accounts/fireworks/models/llama-v3p1-70b-instruct",
|
||||
"Llama-3.2-11B-Vision-Instruct": "accounts/fireworks/models/llama-v3p2-11b-vision-instruct",
|
||||
"Llama-4-Scout-17B-16E-Instruct": "accounts/fireworks/models/llama4-scout-instruct-basic",
|
||||
"Llama-4-Maverick-17B-128E-Instruct": "accounts/fireworks/models/llama4-maverick-instruct-basic",
|
||||
},
|
||||
"together": {
|
||||
"Llama-3.3-70B-Instruct": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
||||
"Llama-3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
|
||||
"Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
"Llama-4-Maverick-17B-128E-Instruct": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
},
|
||||
"groq": {
|
||||
"Llama-3.3-70B-Instruct": "llama-3.3-70b-versatile",
|
||||
"Llama-3.2-11B-Vision-Instruct": "llama-3.2-11b-vision-preview",
|
||||
"Llama-4-Scout-17B-16E-Instruct": "llama-4-scout-17b-16e-instruct",
|
||||
"Llama-4-Maverick-17B-128E-Instruct": "llama-4-maverick-17b-128e-instruct",
|
||||
},
|
||||
"cerebras": {
|
||||
"Llama-3.3-70B-Instruct": "llama-3.3-70b",
|
||||
},
|
||||
"openai": {
|
||||
"gpt-4o": "gpt-4o",
|
||||
"gpt-4o-mini": "gpt-4o-mini",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def provider_metadata():
|
||||
return {
|
||||
"fireworks": ("https://api.fireworks.ai/inference/v1", "FIREWORKS_API_KEY"),
|
||||
"together": ("https://api.together.xyz/v1", "TOGETHER_API_KEY"),
|
||||
"groq": ("https://api.groq.com/openai/v1", "GROQ_API_KEY"),
|
||||
"cerebras": ("https://api.cerebras.ai/v1", "CEREBRAS_API_KEY"),
|
||||
"openai": ("https://api.openai.com/v1", "OPENAI_API_KEY"),
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def provider(request, provider_metadata):
|
||||
provider = request.config.getoption("--provider")
|
||||
base_url = request.config.getoption("--base-url")
|
||||
|
||||
if provider and base_url and provider_metadata[provider][0] != base_url:
|
||||
raise ValueError(f"Provider {provider} is not supported for base URL {base_url}")
|
||||
|
||||
if not provider:
|
||||
if not base_url:
|
||||
raise ValueError("Provider and base URL are not provided")
|
||||
for provider, metadata in provider_metadata.items():
|
||||
if metadata[0] == base_url:
|
||||
provider = provider
|
||||
break
|
||||
|
||||
return provider
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def base_url(request, provider, provider_metadata):
|
||||
return request.config.getoption("--base-url") or provider_metadata[provider][0]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def api_key(request, provider, provider_metadata):
|
||||
return request.config.getoption("--api-key") or os.getenv(provider_metadata[provider][1])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def model_mapping(provider, providers_model_mapping):
|
||||
return providers_model_mapping[provider]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def openai_client(base_url, api_key):
|
||||
return OpenAI(
|
||||
base_url=base_url,
|
||||
api_key=api_key,
|
||||
)
|
|
@ -1,202 +0,0 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from pydantic import BaseModel
|
||||
|
||||
from tests.verifications.openai.fixtures.load import load_test_cases
|
||||
|
||||
chat_completion_test_cases = load_test_cases("chat_completion")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def correct_model_name(model, provider, providers_model_mapping):
|
||||
"""Return the provider-specific model name based on the generic model name."""
|
||||
mapping = providers_model_mapping[provider]
|
||||
if model not in mapping:
|
||||
pytest.skip(f"Provider {provider} does not support model {model}")
|
||||
return mapping[model]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_basic"]["test_params"]["model"])
|
||||
@pytest.mark.parametrize(
|
||||
"input_output",
|
||||
chat_completion_test_cases["test_chat_basic"]["test_params"]["input_output"],
|
||||
)
|
||||
def test_chat_non_streaming_basic(openai_client, input_output, correct_model_name):
|
||||
response = openai_client.chat.completions.create(
|
||||
model=correct_model_name,
|
||||
messages=input_output["input"]["messages"],
|
||||
stream=False,
|
||||
)
|
||||
assert response.choices[0].message.role == "assistant"
|
||||
assert input_output["output"].lower() in response.choices[0].message.content.lower()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_basic"]["test_params"]["model"])
|
||||
@pytest.mark.parametrize(
|
||||
"input_output",
|
||||
chat_completion_test_cases["test_chat_basic"]["test_params"]["input_output"],
|
||||
)
|
||||
def test_chat_streaming_basic(openai_client, input_output, correct_model_name):
|
||||
response = openai_client.chat.completions.create(
|
||||
model=correct_model_name,
|
||||
messages=input_output["input"]["messages"],
|
||||
stream=True,
|
||||
)
|
||||
content = ""
|
||||
for chunk in response:
|
||||
content += chunk.choices[0].delta.content or ""
|
||||
|
||||
# TODO: add detailed type validation
|
||||
|
||||
assert input_output["output"].lower() in content.lower()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_image"]["test_params"]["model"])
|
||||
@pytest.mark.parametrize(
|
||||
"input_output",
|
||||
chat_completion_test_cases["test_chat_image"]["test_params"]["input_output"],
|
||||
)
|
||||
def test_chat_non_streaming_image(openai_client, input_output, correct_model_name):
|
||||
response = openai_client.chat.completions.create(
|
||||
model=correct_model_name,
|
||||
messages=input_output["input"]["messages"],
|
||||
stream=False,
|
||||
)
|
||||
assert response.choices[0].message.role == "assistant"
|
||||
assert input_output["output"].lower() in response.choices[0].message.content.lower()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_image"]["test_params"]["model"])
|
||||
@pytest.mark.parametrize(
|
||||
"input_output",
|
||||
chat_completion_test_cases["test_chat_image"]["test_params"]["input_output"],
|
||||
)
|
||||
def test_chat_streaming_image(openai_client, input_output, correct_model_name):
|
||||
response = openai_client.chat.completions.create(
|
||||
model=correct_model_name,
|
||||
messages=input_output["input"]["messages"],
|
||||
stream=True,
|
||||
)
|
||||
content = ""
|
||||
for chunk in response:
|
||||
content += chunk.choices[0].delta.content or ""
|
||||
|
||||
# TODO: add detailed type validation
|
||||
|
||||
assert input_output["output"].lower() in content.lower()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["model"],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"input_output",
|
||||
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["input_output"],
|
||||
)
|
||||
def test_chat_non_streaming_structured_output(openai_client, input_output, correct_model_name):
|
||||
response = openai_client.chat.completions.create(
|
||||
model=correct_model_name,
|
||||
messages=input_output["input"]["messages"],
|
||||
response_format=input_output["input"]["response_format"],
|
||||
stream=False,
|
||||
)
|
||||
|
||||
assert response.choices[0].message.role == "assistant"
|
||||
maybe_json_content = response.choices[0].message.content
|
||||
|
||||
validate_structured_output(maybe_json_content, input_output["output"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["model"],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"input_output",
|
||||
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["input_output"],
|
||||
)
|
||||
def test_chat_streaming_structured_output(openai_client, input_output, correct_model_name):
|
||||
response = openai_client.chat.completions.create(
|
||||
model=correct_model_name,
|
||||
messages=input_output["input"]["messages"],
|
||||
response_format=input_output["input"]["response_format"],
|
||||
stream=True,
|
||||
)
|
||||
maybe_json_content = ""
|
||||
for chunk in response:
|
||||
maybe_json_content += chunk.choices[0].delta.content or ""
|
||||
validate_structured_output(maybe_json_content, input_output["output"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
chat_completion_test_cases["test_tool_calling"]["test_params"]["model"],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"input_output",
|
||||
chat_completion_test_cases["test_tool_calling"]["test_params"]["input_output"],
|
||||
)
|
||||
def test_chat_non_streaming_tool_calling(openai_client, input_output, correct_model_name):
|
||||
response = openai_client.chat.completions.create(
|
||||
model=correct_model_name,
|
||||
messages=input_output["input"]["messages"],
|
||||
tools=input_output["input"]["tools"],
|
||||
stream=False,
|
||||
)
|
||||
|
||||
assert response.choices[0].message.role == "assistant"
|
||||
assert len(response.choices[0].message.tool_calls) > 0
|
||||
assert input_output["output"] == "get_weather_tool_call"
|
||||
assert response.choices[0].message.tool_calls[0].function.name == "get_weather"
|
||||
# TODO: add detailed type validation
|
||||
|
||||
|
||||
def get_structured_output(maybe_json_content: str, schema_name: str) -> Any | None:
|
||||
if schema_name == "valid_calendar_event":
|
||||
|
||||
class CalendarEvent(BaseModel):
|
||||
name: str
|
||||
date: str
|
||||
participants: list[str]
|
||||
|
||||
try:
|
||||
calendar_event = CalendarEvent.model_validate_json(maybe_json_content)
|
||||
return calendar_event
|
||||
except Exception:
|
||||
return None
|
||||
elif schema_name == "valid_math_reasoning":
|
||||
|
||||
class Step(BaseModel):
|
||||
explanation: str
|
||||
output: str
|
||||
|
||||
class MathReasoning(BaseModel):
|
||||
steps: list[Step]
|
||||
final_answer: str
|
||||
|
||||
try:
|
||||
math_reasoning = MathReasoning.model_validate_json(maybe_json_content)
|
||||
return math_reasoning
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:
|
||||
structured_output = get_structured_output(maybe_json_content, schema_name)
|
||||
assert structured_output is not None
|
||||
if schema_name == "valid_calendar_event":
|
||||
assert structured_output.name is not None
|
||||
assert structured_output.date is not None
|
||||
assert len(structured_output.participants) == 2
|
||||
elif schema_name == "valid_math_reasoning":
|
||||
assert len(structured_output.final_answer) > 0
|
tests/verifications/openai_api/fixtures/fixtures.py (new file, 105 lines)
|
@ -0,0 +1,105 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os
from pathlib import Path

import pytest
import yaml
from openai import OpenAI


# --- Helper Function to Load Config ---
def _load_all_verification_configs():
    """Load and aggregate verification configs from the conf/ directory."""
    # Note: Path is relative to *this* file (fixtures.py)
    conf_dir = Path(__file__).parent.parent.parent / "conf"
    if not conf_dir.is_dir():
        # Use pytest.fail if called during test collection, otherwise raise error
        # For simplicity here, we'll raise an error, assuming direct calls
        # are less likely or can handle it.
        raise FileNotFoundError(f"Verification config directory not found at {conf_dir}")

    all_provider_configs = {}
    yaml_files = list(conf_dir.glob("*.yaml"))
    if not yaml_files:
        raise FileNotFoundError(f"No YAML configuration files found in {conf_dir}")

    for config_path in yaml_files:
        provider_name = config_path.stem
        try:
            with open(config_path, "r") as f:
                provider_config = yaml.safe_load(f)
                if provider_config:
                    all_provider_configs[provider_name] = provider_config
                else:
                    # Log warning if possible, or just skip empty files silently
                    print(f"Warning: Config file {config_path} is empty or invalid.")
        except Exception as e:
            raise IOError(f"Error loading config file {config_path}: {e}") from e

    return {"providers": all_provider_configs}


# --- End Helper Function ---


@pytest.fixture(scope="session")
def verification_config():
    """Pytest fixture to provide the loaded verification config."""
    try:
        return _load_all_verification_configs()
    except (FileNotFoundError, IOError) as e:
        pytest.fail(str(e))  # Fail test collection if config loading fails


@pytest.fixture
def provider(request, verification_config):
    provider = request.config.getoption("--provider")
    base_url = request.config.getoption("--base-url")

    if provider and base_url and verification_config["providers"][provider]["base_url"] != base_url:
        raise ValueError(f"Provider {provider} is not supported for base URL {base_url}")

    if not provider:
        if not base_url:
            raise ValueError("Provider and base URL are not provided")
        for provider, metadata in verification_config["providers"].items():
            if metadata["base_url"] == base_url:
                provider = provider
                break

    return provider


@pytest.fixture
def base_url(request, provider, verification_config):
    return request.config.getoption("--base-url") or verification_config["providers"][provider]["base_url"]


@pytest.fixture
def api_key(request, provider, verification_config):
    provider_conf = verification_config.get("providers", {}).get(provider, {})
    api_key_env_var = provider_conf.get("api_key_var")

    key_from_option = request.config.getoption("--api-key")
    key_from_env = os.getenv(api_key_env_var) if api_key_env_var else None

    final_key = key_from_option or key_from_env
    return final_key


@pytest.fixture
def model_mapping(provider, providers_model_mapping):
    return providers_model_mapping[provider]


@pytest.fixture
def openai_client(base_url, api_key):
    return OpenAI(
        base_url=base_url,
        api_key=api_key,
    )
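For reference, here is a minimal sketch of the provider-config shape these fixtures expect to find under `conf/`. The keys (`base_url`, `api_key_var`, `models`, `test_exclusions`) are the ones actually read by `fixtures.py` and by the test module further below; the concrete names and values are illustrative placeholders, not the shipped configuration files.

```python
# Illustrative only: the approximate structure of one conf/<provider>.yaml file
# after _load_all_verification_configs() has parsed it. Values are made up.
example_provider_config = {
    "base_url": "https://api.example-provider.com/v1",  # hypothetical endpoint
    "api_key_var": "EXAMPLE_PROVIDER_API_KEY",          # env var the api_key fixture reads
    "models": [                                         # used by pytest_generate_tests
        "example/model-a",
        "example/model-b",
    ],
    # Tests listed per model here are skipped by should_skip_test().
    "test_exclusions": {
        "example/model-a": ["test_chat_non_streaming_image", "test_chat_streaming_image"],
    },
}

# _load_all_verification_configs() aggregates one such mapping per YAML file:
# {"providers": {"<provider-name>": example_provider_config, ...}}
```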
@@ -1,31 +1,24 @@
 test_chat_basic:
   test_name: test_chat_basic
   test_params:
-    input_output:
-    - input:
+    case:
+    - case_id: "earth"
+      input:
         messages:
         - content: Which planet do humans live on?
           role: user
       output: Earth
-    - input:
+    - case_id: "saturn"
+      input:
         messages:
         - content: Which planet has rings around it with a name starting with letter
             S?
           role: user
       output: Saturn
-    model:
-    - Llama-3.3-8B-Instruct
-    - Llama-3.3-70B-Instruct
-    - Llama-4-Scout-17B-16E
-    - Llama-4-Scout-17B-16E-Instruct
-    - Llama-4-Maverick-17B-128E
-    - Llama-4-Maverick-17B-128E-Instruct
-    - gpt-4o
-    - gpt-4o-mini
 test_chat_image:
   test_name: test_chat_image
   test_params:
-    input_output:
+    case:
     - input:
         messages:
         - content:
@@ -36,18 +29,12 @@ test_chat_image:
             type: image_url
           role: user
       output: llama
-    model:
-    - Llama-4-Scout-17B-16E
-    - Llama-4-Scout-17B-16E-Instruct
-    - Llama-4-Maverick-17B-128E
-    - Llama-4-Maverick-17B-128E-Instruct
-    - gpt-4o
-    - gpt-4o-mini
 test_chat_structured_output:
   test_name: test_chat_structured_output
   test_params:
-    input_output:
-    - input:
+    case:
+    - case_id: "calendar"
+      input:
         messages:
         - content: Extract the event information.
           role: system
@@ -77,7 +64,8 @@ test_chat_structured_output:
               type: object
           type: json_schema
       output: valid_calendar_event
-    - input:
+    - case_id: "math"
+      input:
         messages:
         - content: You are a helpful math tutor. Guide the user through the solution
             step by step.
@@ -118,19 +106,10 @@ test_chat_structured_output:
               type: object
           type: json_schema
       output: valid_math_reasoning
-    model:
-    - Llama-3.3-8B-Instruct
-    - Llama-3.3-70B-Instruct
-    - Llama-4-Scout-17B-16E
-    - Llama-4-Scout-17B-16E-Instruct
-    - Llama-4-Maverick-17B-128E
-    - Llama-4-Maverick-17B-128E-Instruct
-    - gpt-4o
-    - gpt-4o-mini
 test_tool_calling:
   test_name: test_tool_calling
   test_params:
-    input_output:
+    case:
     - input:
         messages:
         - content: You are a helpful assistant that can use tools to get information.
@@ -152,11 +131,3 @@ test_tool_calling:
               type: object
           type: function
       output: get_weather_tool_call
-    model:
-    - Llama-3.3-70B-Instruct
-    - Llama-4-Scout-17B-16E
-    - Llama-4-Scout-17B-16E-Instruct
-    - Llama-4-Maverick-17B-128E
-    - Llama-4-Maverick-17B-128E-Instruct
-    - gpt-4o
-    - gpt-4o-mini
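To make the new `case` layout concrete, here is a rough sketch of how one entry from the YAML above surfaces in the test module once `load_test_cases("chat_completion")` has parsed it. The exact parsing lives in the `load_test_cases` helper imported by the test module below and is not reproduced here; the dict shape is inferred from how the tests index into it.

```python
# Rough sketch (assumed shape, not the loader's implementation) of the parsed
# test-case structure the tests index into.
chat_completion_test_cases = {
    "test_chat_basic": {
        "test_name": "test_chat_basic",
        "test_params": {
            "case": [
                {
                    "case_id": "earth",
                    "input": {"messages": [{"role": "user", "content": "Which planet do humans live on?"}]},
                    "output": "Earth",
                },
                # ... the "saturn" case and the other test groups follow the same shape
            ]
        },
    },
    # "test_chat_image", "test_chat_structured_output", and "test_tool_calling"
    # are keyed the same way.
}

# A test then parametrizes over:
cases = chat_completion_test_cases["test_chat_basic"]["test_params"]["case"]
```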
tests/verifications/openai_api/test_chat_completion.py (new file, 271 lines)

@@ -0,0 +1,271 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import re
from typing import Any

import pytest
from pydantic import BaseModel

from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
from tests.verifications.openai_api.fixtures.load import load_test_cases

chat_completion_test_cases = load_test_cases("chat_completion")


def case_id_generator(case):
    """Generate a test ID from the case's 'case_id' field, or use a default."""
    case_id = case.get("case_id")
    if isinstance(case_id, (str, int)):
        return re.sub(r"\W|^(?=\d)", "_", str(case_id))
    return None
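
# Illustration (not part of the committed file): for the cases defined in the
# YAML above, the generated IDs are e.g. "earth", "saturn", "calendar", "math".
# A hypothetical case_id such as "case 0!" would be sanitized to "case_0_"
# (non-word characters become "_", and an "_" is prefixed when the ID starts
# with a digit). Cases without a case_id return None here, so pytest falls back
# to its default numbering, e.g. "case0".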

def pytest_generate_tests(metafunc):
    """Dynamically parametrize tests based on the selected provider and config."""
    if "model" in metafunc.fixturenames:
        provider = metafunc.config.getoption("provider")
        if not provider:
            print("Warning: --provider not specified. Skipping model parametrization.")
            metafunc.parametrize("model", [])
            return

        try:
            config_data = _load_all_verification_configs()
        except (FileNotFoundError, IOError) as e:
            print(f"ERROR loading verification configs: {e}")
            config_data = {"providers": {}}

        provider_config = config_data.get("providers", {}).get(provider)
        if provider_config:
            models = provider_config.get("models", [])
            if models:
                metafunc.parametrize("model", models)
            else:
                print(f"Warning: No models found for provider '{provider}' in config.")
                metafunc.parametrize("model", [])  # Parametrize empty if no models found
        else:
            print(f"Warning: Provider '{provider}' not found in config. No models parametrized.")
            metafunc.parametrize("model", [])  # Parametrize empty if provider not found


def should_skip_test(verification_config, provider, model, test_name_base):
    """Check if a test should be skipped based on config exclusions."""
    provider_config = verification_config.get("providers", {}).get(provider)
    if not provider_config:
        return False  # No config for provider, don't skip

    exclusions = provider_config.get("test_exclusions", {}).get(model, [])
    return test_name_base in exclusions


# Helper to get the base test name from the request object
def get_base_test_name(request):
    return request.node.originalname


# --- Test Functions ---


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_chat_basic"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_non_streaming_basic(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    assert case["output"].lower() in response.choices[0].message.content.lower()


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_chat_basic"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        stream=True,
    )
    content = ""
    for chunk in response:
        content += chunk.choices[0].delta.content or ""

    # TODO: add detailed type validation

    assert case["output"].lower() in content.lower()


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_chat_image"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_non_streaming_image(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    assert case["output"].lower() in response.choices[0].message.content.lower()


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_chat_image"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        stream=True,
    )
    content = ""
    for chunk in response:
        content += chunk.choices[0].delta.content or ""

    # TODO: add detailed type validation

    assert case["output"].lower() in content.lower()


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_non_streaming_structured_output(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        response_format=case["input"]["response_format"],
        stream=False,
    )

    assert response.choices[0].message.role == "assistant"
    maybe_json_content = response.choices[0].message.content

    validate_structured_output(maybe_json_content, case["output"])


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        response_format=case["input"]["response_format"],
        stream=True,
    )
    maybe_json_content = ""
    for chunk in response:
        maybe_json_content += chunk.choices[0].delta.content or ""
    validate_structured_output(maybe_json_content, case["output"])


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_tool_calling"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_non_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        tools=case["input"]["tools"],
        stream=False,
    )

    assert response.choices[0].message.role == "assistant"
    assert len(response.choices[0].message.tool_calls) > 0
    assert case["output"] == "get_weather_tool_call"
    assert response.choices[0].message.tool_calls[0].function.name == "get_weather"
    # TODO: add detailed type validation


# --- Helper functions (structured output validation) ---


def get_structured_output(maybe_json_content: str, schema_name: str) -> Any | None:
    if schema_name == "valid_calendar_event":

        class CalendarEvent(BaseModel):
            name: str
            date: str
            participants: list[str]

        try:
            calendar_event = CalendarEvent.model_validate_json(maybe_json_content)
            return calendar_event
        except Exception:
            return None
    elif schema_name == "valid_math_reasoning":

        class Step(BaseModel):
            explanation: str
            output: str

        class MathReasoning(BaseModel):
            steps: list[Step]
            final_answer: str

        try:
            math_reasoning = MathReasoning.model_validate_json(maybe_json_content)
            return math_reasoning
        except Exception:
            return None

    return None


def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:
    structured_output = get_structured_output(maybe_json_content, schema_name)
    assert structured_output is not None
    if schema_name == "valid_calendar_event":
        assert structured_output.name is not None
        assert structured_output.date is not None
        assert len(structured_output.participants) == 2
    elif schema_name == "valid_math_reasoning":
        assert len(structured_output.final_answer) > 0
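As a quick illustration of the helpers above (a sketch, not part of the committed file): `validate_structured_output` takes the raw JSON text returned by the model and the schema name from the case's `output` field. The JSON strings below are made up for illustration but satisfy the two schemas being checked.

```python
# Example payloads that satisfy the two schemas validated above.
calendar_json = '{"name": "Team sync", "date": "2025-04-10", "participants": ["Alice", "Bob"]}'
math_json = '{"steps": [{"explanation": "Subtract 7 from both sides", "output": "x = 4"}], "final_answer": "x = 4"}'

validate_structured_output(calendar_json, "valid_calendar_event")  # passes: exactly 2 participants
validate_structured_output(math_json, "valid_math_reasoning")      # passes: non-empty final_answer
```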
tests/verifications/test_results/fireworks_1744264202.json (new file, 1329 lines)
File diff suppressed because it is too large.

tests/verifications/test_results/openai_1744264304.json (new file, 868 lines)
@@ -0,0 +1,868 @@
{
    "created": 1744264338.9923031,
    "duration": 32.825536012649536,
    "exitcode": 0,
    "root": "/Users/erichuang/projects/llama-stack",
    "environment": {},
    "summary": {
        "passed": 22,
        "total": 22,
        "collected": 22
    },
    "collectors": [ ... ],
    "tests": [ ... ]
}

(The "collectors" array lists the 22 test nodes collected from tests/verifications/openai_api/test_chat_completion.py; the "tests" array holds one record per node, covering gpt-4o and gpt-4o-mini across the earth/saturn basic cases, the case0 image cases, the calendar/math structured-output cases, and the case0 tool-calling case, each with model/case_id metadata and setup/call/teardown durations. Every recorded outcome is "passed".)
tests/verifications/test_results/together_1744264258.json (new file, 1420 lines)
File diff suppressed because it is too large.