Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-28 02:53:30 +00:00)
feat(verification): various improvements (#1921)
# What does this PR do?

- Providers and their models now live in config.yaml.
- Better distinguish the different cases within a test.
- Add a model key to surface each provider's model_id.
- Include an example command to rerun a single test case (an illustrative Python equivalent is sketched after the change summary below).

## Test Plan

<img width="1173" alt="image" src="https://github.com/user-attachments/assets/b414baf0-c768-451f-8c3b-c2905cf36fac" />
Parent: 09a83b1ec1
Commit: 14146e4b3f
22 changed files with 4449 additions and 8810 deletions
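The rerun command mentioned in the description is what the regenerated REPORT.md below documents per provider. As a rough, hedged illustration only (not part of this commit), the same single-case run can also be driven from Python via `pytest.main`; the test path, `--provider` option, and `-k` expression are taken from this diff, while wrapping them in a standalone snippet is an assumption here.

```python
# Hedged sketch: drive one verification case from Python instead of the shell.
# The test path, --provider flag, and -k expression mirror the commands added
# to REPORT.md in this commit; running it still requires the provider's API key
# (e.g. TOGETHER_API_KEY) to be set in the environment.
import pytest

exit_code = pytest.main(
    [
        "tests/verifications/openai_api/test_chat_completion.py",
        "--provider=together",
        "-k",
        "test_chat_non_streaming_basic and earth",
        "-v",
    ]
)
print(f"pytest exit code: {exit_code}")
```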
@@ -1,6 +1,6 @@
 # Test Results Report
 
-*Generated on: 2025-04-08 21:14:02*
+*Generated on: 2025-04-09 22:52:19*
 
 *This report was generated by running `python tests/verifications/generate_report.py`*
 
@@ -23,66 +23,107 @@
 
 ## Together
 
-*Tests run on: 2025-04-08 16:19:59*
+*Tests run on: 2025-04-09 22:50:58*
 
 ```bash
-pytest tests/verifications/openai/test_chat_completion.py --provider=together -v
+# Run all tests for this provider:
+pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -v
+
+# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
+pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -k "test_chat_non_streaming_basic and earth"
 ```
 
-| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct |
+**Model Key (Together)**
+
+| Display Name | Full Model ID |
+| --- | --- |
+| Llama-3.3-70B-Instruct | `meta-llama/Llama-3.3-70B-Instruct-Turbo` |
+| Llama-4-Maverick-Instruct | `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` |
+| Llama-4-Scout-Instruct | `meta-llama/Llama-4-Scout-17B-16E-Instruct` |
+
+| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
 | --- | --- | --- | --- |
-| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ |
+| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
-| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ |
+| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
-| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ |
+| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
-| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
+| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
-| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ |
+| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
-| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ | ✅ |
+| test_chat_non_streaming_tool_calling | ✅ | ✅ | ✅ |
-| test_chat_streaming_basic (case 0) | ✅ | ❌ | ❌ |
+| test_chat_streaming_basic (earth) | ✅ | ❌ | ❌ |
-| test_chat_streaming_basic (case 1) | ✅ | ❌ | ❌ |
+| test_chat_streaming_basic (saturn) | ✅ | ❌ | ❌ |
-| test_chat_streaming_image (case 0) | ⚪ | ❌ | ❌ |
+| test_chat_streaming_image | ⚪ | ❌ | ❌ |
-| test_chat_streaming_structured_output (case 0) | ✅ | ❌ | ❌ |
+| test_chat_streaming_structured_output (calendar) | ✅ | ❌ | ❌ |
-| test_chat_streaming_structured_output (case 1) | ✅ | ❌ | ❌ |
+| test_chat_streaming_structured_output (math) | ✅ | ❌ | ❌ |
 
 ## Fireworks
 
-*Tests run on: 2025-04-08 16:18:28*
+*Tests run on: 2025-04-09 22:50:02*
 
 ```bash
-pytest tests/verifications/openai/test_chat_completion.py --provider=fireworks -v
+# Run all tests for this provider:
+pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -v
+
+# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
+pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -k "test_chat_non_streaming_basic and earth"
 ```
 
-| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct |
+**Model Key (Fireworks)**
+
+| Display Name | Full Model ID |
+| --- | --- |
+| Llama-3.3-70B-Instruct | `accounts/fireworks/models/llama-v3p3-70b-instruct` |
+| Llama-4-Maverick-Instruct | `accounts/fireworks/models/llama4-maverick-instruct-basic` |
+| Llama-4-Scout-Instruct | `accounts/fireworks/models/llama4-scout-instruct-basic` |
+
+| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
 | --- | --- | --- | --- |
-| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ |
+| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
-| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ |
+| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
-| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ |
+| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
-| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
+| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
-| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ |
+| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
-| test_chat_non_streaming_tool_calling (case 0) | ✅ | ❌ | ❌ |
+| test_chat_non_streaming_tool_calling | ❌ | ❌ | ❌ |
-| test_chat_streaming_basic (case 0) | ✅ | ✅ | ✅ |
+| test_chat_streaming_basic (earth) | ✅ | ✅ | ✅ |
-| test_chat_streaming_basic (case 1) | ✅ | ✅ | ✅ |
+| test_chat_streaming_basic (saturn) | ✅ | ✅ | ✅ |
-| test_chat_streaming_image (case 0) | ⚪ | ✅ | ✅ |
+| test_chat_streaming_image | ⚪ | ✅ | ✅ |
-| test_chat_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
+| test_chat_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
-| test_chat_streaming_structured_output (case 1) | ❌ | ✅ | ✅ |
+| test_chat_streaming_structured_output (math) | ✅ | ✅ | ✅ |
 
 ## Openai
 
-*Tests run on: 2025-04-08 16:22:02*
+*Tests run on: 2025-04-09 22:51:44*
 
 ```bash
-pytest tests/verifications/openai/test_chat_completion.py --provider=openai -v
+# Run all tests for this provider:
+pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -v
+
+# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
+pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -k "test_chat_non_streaming_basic and earth"
 ```
 
+**Model Key (Openai)**
+
+| Display Name | Full Model ID |
+| --- | --- |
+| gpt-4o | `gpt-4o` |
+| gpt-4o-mini | `gpt-4o-mini` |
 
 | Test | gpt-4o | gpt-4o-mini |
 | --- | --- | --- |
-| test_chat_non_streaming_basic (case 0) | ✅ | ✅ |
+| test_chat_non_streaming_basic (earth) | ✅ | ✅ |
-| test_chat_non_streaming_basic (case 1) | ✅ | ✅ |
+| test_chat_non_streaming_basic (saturn) | ✅ | ✅ |
-| test_chat_non_streaming_image (case 0) | ✅ | ✅ |
+| test_chat_non_streaming_image | ✅ | ✅ |
-| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ |
+| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ |
-| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ |
+| test_chat_non_streaming_structured_output (math) | ✅ | ✅ |
-| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ |
+| test_chat_non_streaming_tool_calling | ✅ | ✅ |
-| test_chat_streaming_basic (case 0) | ✅ | ✅ |
+| test_chat_streaming_basic (earth) | ✅ | ✅ |
-| test_chat_streaming_basic (case 1) | ✅ | ✅ |
+| test_chat_streaming_basic (saturn) | ✅ | ✅ |
-| test_chat_streaming_image (case 0) | ✅ | ✅ |
+| test_chat_streaming_image | ✅ | ✅ |
-| test_chat_streaming_structured_output (case 0) | ✅ | ✅ |
+| test_chat_streaming_structured_output (calendar) | ✅ | ✅ |
-| test_chat_streaming_structured_output (case 1) | ✅ | ✅ |
+| test_chat_streaming_structured_output (math) | ✅ | ✅ |
 

tests/verifications/conf/cerebras.yaml (new file)
@@ -0,0 +1,10 @@
+base_url: https://api.cerebras.ai/v1
+api_key_var: CEREBRAS_API_KEY
+models:
+- llama-3.3-70b
+model_display_names:
+  llama-3.3-70b: Llama-3.3-70B-Instruct
+test_exclusions:
+  llama-3.3-70b:
+  - test_chat_non_streaming_image
+  - test_chat_streaming_image

tests/verifications/conf/fireworks.yaml (new file)
@@ -0,0 +1,14 @@
+base_url: https://api.fireworks.ai/inference/v1
+api_key_var: FIREWORKS_API_KEY
+models:
+- accounts/fireworks/models/llama-v3p3-70b-instruct
+- accounts/fireworks/models/llama4-scout-instruct-basic
+- accounts/fireworks/models/llama4-maverick-instruct-basic
+model_display_names:
+  accounts/fireworks/models/llama-v3p3-70b-instruct: Llama-3.3-70B-Instruct
+  accounts/fireworks/models/llama4-scout-instruct-basic: Llama-4-Scout-Instruct
+  accounts/fireworks/models/llama4-maverick-instruct-basic: Llama-4-Maverick-Instruct
+test_exclusions:
+  accounts/fireworks/models/llama-v3p3-70b-instruct:
+  - test_chat_non_streaming_image
+  - test_chat_streaming_image

tests/verifications/conf/groq.yaml (new file)
@@ -0,0 +1,14 @@
+base_url: https://api.groq.com/openai/v1
+api_key_var: GROQ_API_KEY
+models:
+- llama-3.3-70b-versatile
+- llama-4-scout-17b-16e-instruct
+- llama-4-maverick-17b-128e-instruct
+model_display_names:
+  llama-3.3-70b-versatile: Llama-3.3-70B-Instruct
+  llama-4-scout-17b-16e-instruct: Llama-4-Scout-Instruct
+  llama-4-maverick-17b-128e-instruct: Llama-4-Maverick-Instruct
+test_exclusions:
+  llama-3.3-70b-versatile:
+  - test_chat_non_streaming_image
+  - test_chat_streaming_image

tests/verifications/conf/openai.yaml (new file)
@@ -0,0 +1,9 @@
+base_url: https://api.openai.com/v1
+api_key_var: OPENAI_API_KEY
+models:
+- gpt-4o
+- gpt-4o-mini
+model_display_names:
+  gpt-4o: gpt-4o
+  gpt-4o-mini: gpt-4o-mini
+test_exclusions: {}

tests/verifications/conf/together.yaml (new file)
@@ -0,0 +1,14 @@
+base_url: https://api.together.xyz/v1
+api_key_var: TOGETHER_API_KEY
+models:
+- meta-llama/Llama-3.3-70B-Instruct-Turbo
+- meta-llama/Llama-4-Scout-17B-16E-Instruct
+- meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
+model_display_names:
+  meta-llama/Llama-3.3-70B-Instruct-Turbo: Llama-3.3-70B-Instruct
+  meta-llama/Llama-4-Scout-17B-16E-Instruct: Llama-4-Scout-Instruct
+  meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8: Llama-4-Maverick-Instruct
+test_exclusions:
+  meta-llama/Llama-3.3-70B-Instruct-Turbo:
+  - test_chat_non_streaming_image
+  - test_chat_streaming_image
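All five provider configs above share the same fields: `base_url`, `api_key_var`, `models`, `model_display_names`, and `test_exclusions`. The sketch below shows one plausible way to load and inspect such a file; the field names come from the YAML above, but the `load_provider_config` helper is hypothetical and is not the repository's `_load_all_verification_configs` implementation referenced later in this diff.

```python
# Hedged sketch: read one per-provider config of the shape added in this commit.
# Requires pyyaml; the loader function name is illustrative only.
import os

import yaml


def load_provider_config(path: str) -> dict:
    with open(path) as f:
        return yaml.safe_load(f)


config = load_provider_config("tests/verifications/conf/together.yaml")
api_key = os.getenv(config["api_key_var"])  # e.g. TOGETHER_API_KEY

for model_id in config["models"]:
    display_name = config["model_display_names"].get(model_id, model_id)
    excluded = config["test_exclusions"].get(model_id, [])
    print(f"{display_name} ({model_id}): {len(excluded)} excluded tests")
```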

@@ -4,6 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import re
+
+import pytest
+
 
 def pytest_addoption(parser):
     parser.addoption(
@@ -14,7 +18,7 @@ def pytest_addoption(parser):
     parser.addoption(
         "--api-key",
         action="store",
-        help="API key",
+        help="API key to use for the provider",
     )
     parser.addoption(
         "--provider",
@@ -24,5 +28,64 @@
 
 
 pytest_plugins = [
-    "tests.verifications.openai.fixtures.fixtures",
+    "pytest_jsonreport",
+    "tests.verifications.openai_api.fixtures.fixtures",
+    "tests.verifications.openai_api.fixtures.load",
 ]
+
+
+@pytest.hookimpl(optionalhook=True)
+def pytest_json_runtest_metadata(item, call):
+    """Add model and case_id to pytest-json report metadata."""
+    metadata = {}
+    nodeid = item.nodeid
+
+    # 1. Extract model from callspec if available
+    model = item.callspec.params.get("model") if hasattr(item, "callspec") else None
+    if model:
+        metadata["model"] = model
+    else:
+        # Fallback: Try parsing from nodeid (less reliable)
+        match_model = re.search(r"\[(.*?)-", nodeid)
+        if match_model:
+            model = match_model.group(1)  # Store model even if found via fallback
+            metadata["model"] = model
+        else:
+            print(f"Warning: Could not determine model for test {nodeid}")
+            model = None  # Ensure model is None if not found
+
+    # 2. Extract case_id using the known model string if possible
+    if model:
+        # Construct a regex pattern to find the case_id *after* the model name and a hyphen.
+        # Escape the model name in case it contains regex special characters.
+        pattern = re.escape(model) + r"-(.*?)\]$"
+        match_case = re.search(pattern, nodeid)
+        if match_case:
+            case_id = match_case.group(1)
+            metadata["case_id"] = case_id
+        else:
+            # Fallback if the pattern didn't match (e.g., nodeid format unexpected)
+            # Try the old less specific regex as a last resort.
+            match_case_fallback = re.search(r"-(.*?)\]$", nodeid)
+            if match_case_fallback:
+                case_id = match_case_fallback.group(1)
+                metadata["case_id"] = case_id
+                print(f"Warning: Used fallback regex to parse case_id from nodeid {nodeid}")
+            else:
+                print(f"Warning: Could not parse case_id from nodeid {nodeid} even with fallback.")
+                if "case" in (item.callspec.params if hasattr(item, "callspec") else {}):
+                    metadata["case_id"] = "parsing_failed"
+    elif "case" in (item.callspec.params if hasattr(item, "callspec") else {}):
+        # Cannot reliably parse case_id without model, but we know it's a case test.
+        # Try the generic fallback regex.
+        match_case_fallback = re.search(r"-(.*?)\]$", nodeid)
+        if match_case_fallback:
+            case_id = match_case_fallback.group(1)
+            metadata["case_id"] = case_id
+            print(f"Warning: Used fallback regex to parse case_id from nodeid {nodeid} (model unknown)")
+        else:
+            print(f"Warning: Could not parse case_id from nodeid {nodeid} (model unknown)")
+            metadata["case_id"] = "parsing_failed_no_model"
+    # else: Not a test with a model or case param we need to handle.
+
+    return metadata
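The `pytest_json_runtest_metadata` hook above attaches `model` and `case_id` to each test entry in the pytest-json-report output, and `generate_report.py` reads those same keys back. A minimal, hedged sketch of inspecting a results file; the `tests[*].metadata` and `tests[*].call.outcome` structure assumed here is the one `parse_results` consumes below:

```python
# Hedged sketch: read back the per-test metadata written via the hook above.
# Pass the path of a results JSON file produced by generate_report.py
# (stored under tests/verifications/test_results/).
import json
import sys

with open(sys.argv[1]) as f:
    report = json.load(f)

for test in report.get("tests", []):
    metadata = test.get("metadata", {})
    outcome = test.get("call", {}).get("outcome", "not run")
    print(metadata.get("model"), metadata.get("case_id"), outcome)
```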

@@ -4,27 +4,48 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "pytest-json-report",
+#     "pyyaml",
+# ]
+# ///
 """
 Test Report Generator
 
-Requirements:
-    pip install pytest-json-report
+Description:
+    This script runs pytest tests (specifically designed for OpenAI API compatibility checks)
+    for different providers, aggregates the results from JSON reports, and generates
+    a markdown summary report (REPORT.md).
+
+    It automatically cleans up old test result files, keeping only the latest
+    per provider.
+
+Configuration:
+    - Provider details (models, display names) are loaded from `tests/verifications/config.yaml`.
+    - Test cases are defined in YAML files within `tests/verifications/openai_api/fixtures/test_cases/`.
+    - Test results are stored in `tests/verifications/test_results/`.
 
 Usage:
-    # Generate a report using existing test results
+    # Generate a report using the latest existing test results
     python tests/verifications/generate_report.py
 
-    # Run tests and generate a report
+    # Run tests for all configured providers and generate a report
     python tests/verifications/generate_report.py --run-tests
 
-    # Run tests for specific providers
+    # Run tests only for specific providers (space-separated)
     python tests/verifications/generate_report.py --run-tests --providers fireworks openai
 
+    # Run tests matching a keyword expression (uses pytest -k)
+    python tests/verifications/generate_report.py --run-tests --providers fireworks --k "streaming"
+
+    # Run a specific test case for a provider
+    python tests/verifications/generate_report.py --run-tests --providers fireworks --k "test_chat_streaming_basic and basic_earth"
+
     # Save the report to a custom location
     python tests/verifications/generate_report.py --output custom_report.md
-
-    # Clean up old test result files
-    python tests/verifications/generate_report.py --cleanup
 """
 
 import argparse
@@ -35,6 +56,9 @@ import subprocess
 import time
 from collections import defaultdict
 from pathlib import Path
+from typing import Any, DefaultDict, Dict, Set, Tuple
+
+from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
 
 # Define the root directory for test results
 RESULTS_DIR = Path(__file__).parent / "test_results"
|
||||||
# Maximum number of test result files to keep per provider
|
# Maximum number of test result files to keep per provider
|
||||||
MAX_RESULTS_PER_PROVIDER = 1
|
MAX_RESULTS_PER_PROVIDER = 1
|
||||||
|
|
||||||
# Custom order of providers
|
|
||||||
PROVIDER_ORDER = ["together", "fireworks", "groq", "cerebras", "openai"]
|
PROVIDER_ORDER = ["together", "fireworks", "groq", "cerebras", "openai"]
|
||||||
|
|
||||||
# Dictionary to store providers and their models (will be populated dynamically)
|
VERIFICATION_CONFIG = _load_all_verification_configs()
|
||||||
PROVIDERS = defaultdict(set)
|
|
||||||
|
|
||||||
# Tests will be dynamically extracted from results
|
|
||||||
ALL_TESTS = set()
|
|
||||||
|
|
||||||
|
|
||||||
def run_tests(provider):
|
def run_tests(provider, keyword=None):
|
||||||
"""Run pytest for a specific provider and save results"""
|
"""Run pytest for a specific provider and save results"""
|
||||||
print(f"Running tests for provider: {provider}")
|
print(f"Running tests for provider: {provider}")
|
||||||
|
|
||||||
|
@@ -61,20 +80,28 @@ def run_tests(provider):
     result_file = RESULTS_DIR / f"{provider}_{timestamp}.json"
     temp_json_file = RESULTS_DIR / f"temp_{provider}_{timestamp}.json"
 
+    # Determine project root directory relative to this script
+    project_root = Path(__file__).parent.parent.parent
+
     # Run pytest with JSON output
     cmd = [
         "python",
         "-m",
         "pytest",
-        "tests/verifications/openai/test_chat_completion.py",
+        "tests/verifications/openai_api/test_chat_completion.py",
         f"--provider={provider}",
         "-v",
         "--json-report",
         f"--json-report-file={temp_json_file}",
     ]
 
+    # Append -k argument if provided
+    if keyword:
+        cmd.extend(["-k", keyword])
+
     try:
-        result = subprocess.run(cmd, capture_output=True, text=True)
+        # Run subprocess with cwd set to project root
+        result = subprocess.run(cmd, capture_output=True, text=True, cwd=project_root)
         print(f"Pytest exit code: {result.returncode}")
 
         # Check if the JSON file was created
@@ -103,18 +130,30 @@ def run_tests(provider):
         return None
 
 
-def parse_results(result_file):
-    """Parse the test results file and extract pass/fail by model and test"""
+def parse_results(
+    result_file,
+) -> Tuple[DefaultDict[str, DefaultDict[str, Dict[str, bool]]], DefaultDict[str, Set[str]], Set[str]]:
+    """Parse a single test results file.
+
+    Returns:
+        Tuple containing:
+        - parsed_results: DefaultDict[provider, DefaultDict[model, Dict[test_name, pass_status]]]
+        - providers_in_file: DefaultDict[provider, Set[model]] found in this file.
+        - tests_in_file: Set[test_name] found in this file.
+    """
     if not os.path.exists(result_file):
         print(f"Results file does not exist: {result_file}")
-        return {}
+        # Return empty defaultdicts/set matching the type hint
+        return defaultdict(lambda: defaultdict(dict)), defaultdict(set), set()
 
     with open(result_file, "r") as f:
         results = json.load(f)
 
-    # Initialize results dictionary
-    parsed_results = defaultdict(lambda: defaultdict(dict))
-    provider = os.path.basename(result_file).split("_")[0]
+    # Initialize results dictionary with specific types
+    parsed_results: DefaultDict[str, DefaultDict[str, Dict[str, bool]]] = defaultdict(lambda: defaultdict(dict))
+    providers_in_file: DefaultDict[str, Set[str]] = defaultdict(set)
+    tests_in_file: Set[str] = set()
+    provider: str = os.path.basename(result_file).split("_")[0]
 
     # Debug: Print summary of test results
     print(f"Test results summary for {provider}:")
@@ -127,124 +166,72 @@ def parse_results(result_file):
     # Extract test results
     if "tests" not in results or not results["tests"]:
         print(f"No test results found in {result_file}")
-        return parsed_results
+        # Return empty defaultdicts/set matching the type hint
+        return defaultdict(lambda: defaultdict(dict)), defaultdict(set), set()
 
-    # Map for normalizing model names
-    model_name_map = {
-        "Llama-3.3-8B-Instruct": "Llama-3.3-8B-Instruct",
-        "Llama-3.3-70B-Instruct": "Llama-3.3-70B-Instruct",
-        "Llama-3.2-11B-Vision-Instruct": "Llama-3.2-11B-Vision-Instruct",
-        "Llama-4-Scout-17B-16E": "Llama-4-Scout-17B-16E-Instruct",
-        "Llama-4-Scout-17B-16E-Instruct": "Llama-4-Scout-17B-16E-Instruct",
-        "Llama-4-Maverick-17B-128E": "Llama-4-Maverick-17B-128E-Instruct",
-        "Llama-4-Maverick-17B-128E-Instruct": "Llama-4-Maverick-17B-128E-Instruct",
-        "gpt-4o": "gpt-4o",
-        "gpt-4o-mini": "gpt-4o-mini",
-    }
-
-    # Keep track of all models found for this provider
-    provider_models = set()
-
-    # Track all unique test cases for each base test
-    test_case_counts = defaultdict(int)
-
-    # First pass: count the number of cases for each test
+    # Process the tests
     for test in results["tests"]:
         test_id = test.get("nodeid", "")
 
-        if "call" in test:
-            test_name = test_id.split("::")[1].split("[")[0]
-            input_output_match = re.search(r"\[input_output(\d+)-", test_id)
-            if input_output_match:
-                test_case_counts[test_name] += 1
-
-    # Second pass: process the tests with case numbers only for tests with multiple cases
-    for test in results["tests"]:
-        test_id = test.get("nodeid", "")
-        outcome = test.get("outcome", "")
-
-        # Only process tests that have been executed (not setup errors)
-        if "call" in test:
-            # Regular test that actually ran
-            test_name = test_id.split("::")[1].split("[")[0]
-
-            # Extract input_output parameter to differentiate between test cases
-            input_output_match = re.search(r"\[input_output(\d+)-", test_id)
-            input_output_index = input_output_match.group(1) if input_output_match else ""
-
-            # Create a more detailed test name with case number only if there are multiple cases
-            detailed_test_name = test_name
-            if input_output_index and test_case_counts[test_name] > 1:
-                detailed_test_name = f"{test_name} (case {input_output_index})"
-
-            # Track all unique test names
-            ALL_TESTS.add(detailed_test_name)
-
-            # Extract model name from test_id using a more robust pattern
-            model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id)
-            if model_match:
-                raw_model = model_match.group(1)
-                model = model_name_map.get(raw_model, raw_model)
-
-                # Add to set of known models for this provider
-                provider_models.add(model)
-
-                # Also update the global PROVIDERS dictionary
-                PROVIDERS[provider].add(model)
-
-                # Store the result
-                if outcome == "passed":
-                    parsed_results[provider][model][detailed_test_name] = True
-                else:
-                    parsed_results[provider][model][detailed_test_name] = False
-
-                print(f"Parsed test result: {detailed_test_name} for model {model}: {outcome}")
-        elif outcome == "error" and "setup" in test and test.get("setup", {}).get("outcome") == "failed":
-            # This is a setup failure, which likely means a configuration issue
-            # Extract the base test name and model name
-            parts = test_id.split("::")
-            if len(parts) > 1:
-                test_name = parts[1].split("[")[0]
-
-                # Extract input_output parameter to differentiate between test cases
-                input_output_match = re.search(r"\[input_output(\d+)-", test_id)
-                input_output_index = input_output_match.group(1) if input_output_match else ""
-
-                # Create a more detailed test name with case number only if there are multiple cases
-                detailed_test_name = test_name
-                if input_output_index and test_case_counts[test_name] > 1:
-                    detailed_test_name = f"{test_name} (case {input_output_index})"
-
-                if detailed_test_name in ALL_TESTS:
-                    # Use a more robust pattern for model extraction
-                    model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id)
-                    if model_match:
-                        raw_model = model_match.group(1)
-                        model = model_name_map.get(raw_model, raw_model)
-
-                        # Add to set of known models for this provider
-                        provider_models.add(model)
-
-                        # Also update the global PROVIDERS dictionary
-                        PROVIDERS[provider].add(model)
-
-                        # Mark setup failures as false (failed)
-                        parsed_results[provider][model][detailed_test_name] = False
-                        print(f"Parsed setup failure: {detailed_test_name} for model {model}")
-
-    # Debug: Print parsed results
-    if not parsed_results[provider]:
-        print(f"Warning: No test results parsed for provider {provider}")
-    else:
-        for model, tests in parsed_results[provider].items():
-            print(f"Model {model}: {len(tests)} test results")
-
-    return parsed_results
+        if not (call_phase := test.get("call")):
+            continue
+        call_outcome = call_phase.get("outcome")
+        if call_outcome not in ("passed", "failed"):
+            continue
+
+        # --- Extract data from metadata ---
+        metadata = test.get("metadata", {})
+        model = metadata.get("model")
+        case_id = metadata.get("case_id")  # String ID (if provided)
+        case_index = metadata.get("case_index")  # Integer index (if no ID provided)
+
+        # Check if we have a model and at least one case identifier
+        if not model or (case_id is None and case_index is None):
+            print(
+                f"Warning: Missing 'model' or case identifier ('case_id'/'case_index') metadata for test: {test_id}. Skipping."
+            )
+            continue
+
+        try:
+            test_name_base = test_id.split("::")[1].split("[")[0]
+        except (IndexError, ValueError) as e:
+            print(f"Warning: Could not parse base test name for {test_id}. Error: {e}. Skipping.")
+            continue
+
+        # Construct detailed test name using ID or index
+        if case_id is not None:
+            detailed_test_name = f"{test_name_base} ({case_id})"
+        elif case_index == 0:
+            # If case_id is missing and index is 0, assume single case, use base name only
+            detailed_test_name = test_name_base
+        elif case_index is not None:  # case_index > 0
+            # Use case_index for naming if case_id wasn't provided and index > 0
+            detailed_test_name = f"{test_name_base} (case{case_index})"
+        else:
+            # This case should be prevented by the earlier check, but handle defensively
+            print(f"Error: No case identifier found for test {test_id} after initial check. Skipping.")
+            continue
+
+        # Populate collections for this file
+        tests_in_file.add(detailed_test_name)
+        providers_in_file[provider].add(model)
+
+        if call_outcome == "passed":
+            parsed_results[provider][model][detailed_test_name] = True
+        elif call_outcome == "failed":
+            parsed_results[provider][model][detailed_test_name] = False
+
+    # Final Summary Warning (Optional)
+    if not parsed_results.get(provider):
+        print(f"Warning: No valid test results parsed for provider {provider} from file {result_file}")
+
+    return parsed_results, providers_in_file, tests_in_file
 
 
-def cleanup_old_results():
-    """Clean up old test result files, keeping only the newest N per provider"""
-    for provider in PROVIDERS.keys():
+def cleanup_old_results(providers_to_clean: Dict[str, Set[str]]):
+    """Clean up old test result files, keeping only the newest N per provider."""
+    # Use the passed-in providers dictionary
+    for provider in providers_to_clean.keys():
         # Get all result files for this provider
         provider_files = list(RESULTS_DIR.glob(f"{provider}_*.json"))
 
@@ -289,8 +276,17 @@ def get_latest_results_by_provider():
     return provider_results
 
 
-def generate_report(results_dict, output_file=None):
-    """Generate the markdown report"""
+def generate_report(
+    results_dict: Dict[str, Any], providers: Dict[str, Set[str]], all_tests: Set[str], output_file=None
+):
+    """Generate the markdown report.
+
+    Args:
+        results_dict: Aggregated results [provider][model][test_name] -> status.
+        providers: Dict of all providers and their models {provider: {models}}.
+        all_tests: Set of all test names found.
+        output_file: Optional path to save the report.
+    """
     if output_file is None:
         # Default to creating the report in the same directory as this script
         output_file = Path(__file__).parent / "REPORT.md"
@@ -299,8 +295,8 @@ def generate_report(results_dict, output_file=None):
 
     # Get the timestamp from result files
     provider_timestamps = {}
-    provider_results = get_latest_results_by_provider()
-    for provider, result_file in provider_results.items():
+    provider_results_files = get_latest_results_by_provider()
+    for provider, result_file in provider_results_files.items():
         # Extract timestamp from filename (format: provider_timestamp.json)
         try:
             timestamp_str = result_file.stem.split("_")[1]
@@ -310,12 +306,33 @@ def generate_report(results_dict, output_file=None):
         except (IndexError, ValueError):
             provider_timestamps[provider] = "Unknown"
 
-    # Convert provider model sets to sorted lists
-    for provider in PROVIDERS:
-        PROVIDERS[provider] = sorted(PROVIDERS[provider])
+    # Convert provider model sets to sorted lists (use passed-in providers dict)
+    providers_sorted = {prov: sorted(models) for prov, models in providers.items()}
 
-    # Sort tests alphabetically
-    sorted_tests = sorted(ALL_TESTS)
+    # Sort tests alphabetically (use passed-in all_tests set)
+    sorted_tests = sorted(all_tests)
+
+    # Calculate counts for each base test name
+    base_test_case_counts: DefaultDict[str, int] = defaultdict(int)
+    base_test_name_map: Dict[str, str] = {}
+    for test_name in sorted_tests:
+        match = re.match(r"^(.*?)( \([^)]+\))?$", test_name)
+        if match:
+            base_name = match.group(1).strip()
+            base_test_case_counts[base_name] += 1
+            base_test_name_map[test_name] = base_name
+        else:
+            # Should not happen with current naming, but handle defensively
+            base_test_case_counts[test_name] += 1
+            base_test_name_map[test_name] = test_name
+
+    if not sorted_tests:
+        print("Warning: No test results found to generate a report.")
+        # Optionally create an empty report or return early
+        with open(output_file, "w") as f:
+            f.write("# Test Results Report\n\nNo test results found.\n")
+        print(f"Generated empty report: {output_file}")
+        return
 
     report = ["# Test Results Report\n"]
     report.append(f"*Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n")
@@ -336,19 +353,15 @@ def generate_report(results_dict, output_file=None):
     # Add a summary section
     report.append("## Summary\n")
 
-    # Count total tests and passes
+    # Count total tests and passes (use passed-in providers and all_tests)
     total_tests = 0
    passed_tests = 0
     provider_totals = {}
-
-    # Prepare summary data
-    for provider in PROVIDERS.keys():
+    for provider, models in providers_sorted.items():
         provider_passed = 0
         provider_total = 0
 
         if provider in results_dict:
-            provider_models = PROVIDERS[provider]
-            for model in provider_models:
+            for model in models:
                 if model in results_dict[provider]:
                     model_results = results_dict[provider][model]
                     for test in sorted_tests:
@@ -358,33 +371,26 @@ def generate_report(results_dict, output_file=None):
                         if model_results[test]:
                             provider_passed += 1
                             passed_tests += 1
 
         provider_totals[provider] = (provider_passed, provider_total)
 
-    # Add summary table
+    # Add summary table (use passed-in providers dict)
     report.append("| Provider | Pass Rate | Tests Passed | Total Tests |")
     report.append("| --- | --- | --- | --- |")
-
-    # Use the custom order for summary table
-    for provider in [p for p in PROVIDER_ORDER if p in PROVIDERS]:
+    for provider in [p for p in PROVIDER_ORDER if p in providers]:  # Check against keys of passed-in dict
         passed, total = provider_totals.get(provider, (0, 0))
         pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
         report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
-
-    # Add providers not in the custom order
-    for provider in [p for p in PROVIDERS if p not in PROVIDER_ORDER]:
+    for provider in [p for p in providers if p not in PROVIDER_ORDER]:  # Check against keys of passed-in dict
         passed, total = provider_totals.get(provider, (0, 0))
         pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
         report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
 
     report.append("\n")
 
-    # Process each provider in the custom order, then any additional providers
     for provider in sorted(
-        PROVIDERS.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
+        providers_sorted.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
     ):
-        if not PROVIDERS[provider]:
-            # Skip providers with no models
+        provider_models = providers_sorted[provider]  # Use sorted models
+        if not provider_models:
             continue
 
         report.append(f"\n## {provider.capitalize()}\n")
@@ -394,34 +400,70 @@ def generate_report(results_dict, output_file=None):
         report.append(f"*Tests run on: {provider_timestamps[provider]}*\n")
 
         # Add test command for reproducing results
-        test_cmd = f"pytest tests/verifications/openai/test_chat_completion.py --provider={provider} -v"
-        report.append(f"```bash\n{test_cmd}\n```\n")
+        test_cmd_all = f"pytest tests/verifications/openai_api/test_chat_completion.py --provider={provider} -v"
+        report.append(f"```bash\n# Run all tests for this provider:\n{test_cmd_all}\n")
 
-        # Get the relevant models for this provider
-        provider_models = PROVIDERS[provider]
+        # Find an example test with a case ID
+        example_base_test_name = None
+        example_case_id = None
+        # Get first test as fallback base, handle empty list
+        first_test_name = sorted_tests[0] if sorted_tests else "unknown_test"
+
+        match = re.match(r"^(.*?) \((.*?)\)$", first_test_name)
+        if match:
+            example_base_test_name = match.group(1).strip()
+            example_case_id = match.group(2).strip()
+        else:
+            example_base_test_name = first_test_name
+
+        base_name = base_test_name_map.get(test, test)  # Get base name
+        case_count = base_test_case_counts.get(base_name, 1)  # Get count
+        filter_str = f"{example_base_test_name} and {example_case_id}" if case_count > 1 else example_base_test_name
+
+        test_cmd_specific_case = (
+            f'pytest tests/verifications/openai_api/test_chat_completion.py --provider={provider} -k "{filter_str}"'
+        )
+        report.append(
+            f"# Example: Run only the '{example_case_id}' case of {example_base_test_name}:\n{test_cmd_specific_case}\n```\n"
+        )
+
+        # Get display names (use passed-in providers dict)
+        provider_config = VERIFICATION_CONFIG.get("providers", {}).get(provider, {})
+        display_name_map = provider_config.get("model_display_names", {})
+
+        # Add Model Key Table (use provider_models)
+        report.append(f"\n**Model Key ({provider.capitalize()})**\n")
+        provider_key_lines = ["| Display Name | Full Model ID |", "| --- | --- |"]
+        for model_id in provider_models:
+            display_name = display_name_map.get(model_id, model_id)
+            provider_key_lines.append(f"| {display_name} | `{model_id}` |")
+        report.extend(provider_key_lines)
+        report.append("\n")
 
-        # Create table header with models as columns
-        header = "| Test | " + " | ".join(provider_models) + " |"
+        # Create results table header (use provider_models)
+        display_names = [display_name_map.get(m, m) for m in provider_models]
+        header = "| Test | " + " | ".join(display_names) + " |"
         separator = "| --- | " + " | ".join(["---"] * len(provider_models)) + " |"
 
         report.append(header)
         report.append(separator)
 
-        # Get results for this provider
-        provider_results = results_dict.get(provider, {})
+        # Get results for this provider from results_dict
+        provider_results_data = results_dict.get(provider, {})
 
-        # Add rows for each test
+        # Add rows for each test (use sorted_tests)
         for test in sorted_tests:
-            row = f"| {test} |"
+            # Determine display name based on case count
+            base_name = base_test_name_map.get(test, test)  # Get base name
+            case_count = base_test_case_counts.get(base_name, 1)  # Get count
+            display_test_name = base_name if case_count == 1 else test  # Choose display name
+            row = f"| {display_test_name} |"  # Use display name
 
-            # Add results for each model in this test
-            for model in provider_models:
-                if model in provider_results and test in provider_results[model]:
-                    result = pass_icon if provider_results[model][test] else fail_icon
+            for model_id in provider_models:
+                if model_id in provider_results_data and test in provider_results_data[model_id]:
+                    result = pass_icon if provider_results_data[model_id][test] else fail_icon
                 else:
                     result = na_icon
                 row += f" {result} |"
 
             report.append(row)
 
         # Write to file
@@ -442,9 +484,13 @@ def main():
         help="Specify providers to test (comma-separated or space-separated, default: all)",
     )
     parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)")
+    parser.add_argument("--k", type=str, help="Keyword expression to filter tests (passed to pytest -k)")
     args = parser.parse_args()
 
     all_results = {}
+    # Initialize collections to aggregate results in main
+    aggregated_providers = defaultdict(set)
+    aggregated_tests = set()
+
     if args.run_tests:
         # Get list of available providers from command line or use detected providers
@@ -463,22 +509,31 @@ def main():
 
         for provider in test_providers:
             provider = provider.strip()  # Remove any whitespace
-            result_file = run_tests(provider)
+            result_file = run_tests(provider, keyword=args.k)
             if result_file:
-                provider_results = parse_results(result_file)
-                all_results.update(provider_results)
+                # Parse and aggregate results
+                parsed_results, providers_in_file, tests_in_file = parse_results(result_file)
+                all_results.update(parsed_results)
+                for prov, models in providers_in_file.items():
+                    aggregated_providers[prov].update(models)
+                aggregated_tests.update(tests_in_file)
     else:
         # Use existing results
         provider_result_files = get_latest_results_by_provider()
 
         for result_file in provider_result_files.values():
-            provider_results = parse_results(result_file)
-            all_results.update(provider_results)
+            # Parse and aggregate results
+            parsed_results, providers_in_file, tests_in_file = parse_results(result_file)
+            all_results.update(parsed_results)
+            for prov, models in providers_in_file.items():
+                aggregated_providers[prov].update(models)
+            aggregated_tests.update(tests_in_file)
 
-    # Generate the report
-    generate_report(all_results, args.output)
+    # Generate the report, passing aggregated data
+    generate_report(all_results, aggregated_providers, aggregated_tests, args.output)
 
-    cleanup_old_results()
+    # Cleanup, passing aggregated providers
+    cleanup_old_results(aggregated_providers)
 
 
 if __name__ == "__main__":

@@ -1,97 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-
-import pytest
-from openai import OpenAI
-
-
-@pytest.fixture
-def providers_model_mapping():
-    """
-    Mapping from model names used in test cases to provider's model names.
-    """
-    return {
-        "fireworks": {
-            "Llama-3.3-70B-Instruct": "accounts/fireworks/models/llama-v3p1-70b-instruct",
-            "Llama-3.2-11B-Vision-Instruct": "accounts/fireworks/models/llama-v3p2-11b-vision-instruct",
-            "Llama-4-Scout-17B-16E-Instruct": "accounts/fireworks/models/llama4-scout-instruct-basic",
-            "Llama-4-Maverick-17B-128E-Instruct": "accounts/fireworks/models/llama4-maverick-instruct-basic",
-        },
-        "together": {
-            "Llama-3.3-70B-Instruct": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
-            "Llama-3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
-            "Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-            "Llama-4-Maverick-17B-128E-Instruct": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-        },
-        "groq": {
-            "Llama-3.3-70B-Instruct": "llama-3.3-70b-versatile",
-            "Llama-3.2-11B-Vision-Instruct": "llama-3.2-11b-vision-preview",
-            "Llama-4-Scout-17B-16E-Instruct": "llama-4-scout-17b-16e-instruct",
-            "Llama-4-Maverick-17B-128E-Instruct": "llama-4-maverick-17b-128e-instruct",
-        },
-        "cerebras": {
-            "Llama-3.3-70B-Instruct": "llama-3.3-70b",
-        },
-        "openai": {
-            "gpt-4o": "gpt-4o",
-            "gpt-4o-mini": "gpt-4o-mini",
-        },
-    }
-
-
-@pytest.fixture
-def provider_metadata():
-    return {
-        "fireworks": ("https://api.fireworks.ai/inference/v1", "FIREWORKS_API_KEY"),
-        "together": ("https://api.together.xyz/v1", "TOGETHER_API_KEY"),
-        "groq": ("https://api.groq.com/openai/v1", "GROQ_API_KEY"),
-        "cerebras": ("https://api.cerebras.ai/v1", "CEREBRAS_API_KEY"),
-        "openai": ("https://api.openai.com/v1", "OPENAI_API_KEY"),
-    }
-
-
-@pytest.fixture
-def provider(request, provider_metadata):
-    provider = request.config.getoption("--provider")
-    base_url = request.config.getoption("--base-url")
-
-    if provider and base_url and provider_metadata[provider][0] != base_url:
-        raise ValueError(f"Provider {provider} is not supported for base URL {base_url}")
-
-    if not provider:
-        if not base_url:
-            raise ValueError("Provider and base URL are not provided")
-        for provider, metadata in provider_metadata.items():
-            if metadata[0] == base_url:
-                provider = provider
-                break
-
-    return provider
-
-
-@pytest.fixture
-def base_url(request, provider, provider_metadata):
-    return request.config.getoption("--base-url") or provider_metadata[provider][0]
-
-
-@pytest.fixture
-def api_key(request, provider, provider_metadata):
-    return request.config.getoption("--api-key") or os.getenv(provider_metadata[provider][1])
-
-
-@pytest.fixture
-def model_mapping(provider, providers_model_mapping):
-    return providers_model_mapping[provider]
-
-
-@pytest.fixture
-def openai_client(base_url, api_key):
-    return OpenAI(
-        base_url=base_url,
-        api_key=api_key,
-    )
@ -1,202 +0,0 @@
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from pydantic import BaseModel
|
|
||||||
|
|
||||||
from tests.verifications.openai.fixtures.load import load_test_cases
|
|
||||||
|
|
||||||
chat_completion_test_cases = load_test_cases("chat_completion")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def correct_model_name(model, provider, providers_model_mapping):
|
|
||||||
"""Return the provider-specific model name based on the generic model name."""
|
|
||||||
mapping = providers_model_mapping[provider]
|
|
||||||
if model not in mapping:
|
|
||||||
pytest.skip(f"Provider {provider} does not support model {model}")
|
|
||||||
return mapping[model]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_basic"]["test_params"]["model"])
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"input_output",
|
|
||||||
chat_completion_test_cases["test_chat_basic"]["test_params"]["input_output"],
|
|
||||||
)
|
|
||||||
def test_chat_non_streaming_basic(openai_client, input_output, correct_model_name):
|
|
||||||
response = openai_client.chat.completions.create(
|
|
||||||
model=correct_model_name,
|
|
||||||
messages=input_output["input"]["messages"],
|
|
||||||
stream=False,
|
|
||||||
)
|
|
||||||
assert response.choices[0].message.role == "assistant"
|
|
||||||
assert input_output["output"].lower() in response.choices[0].message.content.lower()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_basic"]["test_params"]["model"])
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"input_output",
|
|
||||||
chat_completion_test_cases["test_chat_basic"]["test_params"]["input_output"],
|
|
||||||
)
|
|
||||||
def test_chat_streaming_basic(openai_client, input_output, correct_model_name):
|
|
||||||
response = openai_client.chat.completions.create(
|
|
||||||
model=correct_model_name,
|
|
||||||
messages=input_output["input"]["messages"],
|
|
||||||
stream=True,
|
|
||||||
)
|
|
||||||
content = ""
|
|
||||||
for chunk in response:
|
|
||||||
content += chunk.choices[0].delta.content or ""
|
|
||||||
|
|
||||||
# TODO: add detailed type validation
|
|
||||||
|
|
||||||
assert input_output["output"].lower() in content.lower()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_image"]["test_params"]["model"])
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"input_output",
|
|
||||||
chat_completion_test_cases["test_chat_image"]["test_params"]["input_output"],
|
|
||||||
)
|
|
||||||
def test_chat_non_streaming_image(openai_client, input_output, correct_model_name):
|
|
||||||
response = openai_client.chat.completions.create(
|
|
||||||
model=correct_model_name,
|
|
||||||
messages=input_output["input"]["messages"],
|
|
||||||
stream=False,
|
|
||||||
)
|
|
||||||
assert response.choices[0].message.role == "assistant"
|
|
||||||
assert input_output["output"].lower() in response.choices[0].message.content.lower()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", chat_completion_test_cases["test_chat_image"]["test_params"]["model"])
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"input_output",
|
|
||||||
chat_completion_test_cases["test_chat_image"]["test_params"]["input_output"],
|
|
||||||
)
|
|
||||||
def test_chat_streaming_image(openai_client, input_output, correct_model_name):
|
|
||||||
response = openai_client.chat.completions.create(
|
|
||||||
model=correct_model_name,
|
|
||||||
messages=input_output["input"]["messages"],
|
|
||||||
stream=True,
|
|
||||||
)
|
|
||||||
content = ""
|
|
||||||
for chunk in response:
|
|
||||||
content += chunk.choices[0].delta.content or ""
|
|
||||||
|
|
||||||
# TODO: add detailed type validation
|
|
||||||
|
|
||||||
assert input_output["output"].lower() in content.lower()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"model",
|
|
||||||
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["model"],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"input_output",
|
|
||||||
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["input_output"],
|
|
||||||
)
|
|
||||||
def test_chat_non_streaming_structured_output(openai_client, input_output, correct_model_name):
|
|
||||||
response = openai_client.chat.completions.create(
|
|
||||||
model=correct_model_name,
|
|
||||||
messages=input_output["input"]["messages"],
|
|
||||||
response_format=input_output["input"]["response_format"],
|
|
||||||
stream=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert response.choices[0].message.role == "assistant"
|
|
||||||
maybe_json_content = response.choices[0].message.content
|
|
||||||
|
|
||||||
validate_structured_output(maybe_json_content, input_output["output"])
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"model",
|
|
||||||
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["model"],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"input_output",
|
|
||||||
chat_completion_test_cases["test_chat_structured_output"]["test_params"]["input_output"],
|
|
||||||
)
|
|
||||||
def test_chat_streaming_structured_output(openai_client, input_output, correct_model_name):
|
|
||||||
response = openai_client.chat.completions.create(
|
|
||||||
model=correct_model_name,
|
|
||||||
messages=input_output["input"]["messages"],
|
|
||||||
response_format=input_output["input"]["response_format"],
|
|
||||||
stream=True,
|
|
||||||
)
|
|
||||||
maybe_json_content = ""
|
|
||||||
for chunk in response:
|
|
||||||
maybe_json_content += chunk.choices[0].delta.content or ""
|
|
||||||
validate_structured_output(maybe_json_content, input_output["output"])
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"model",
|
|
||||||
chat_completion_test_cases["test_tool_calling"]["test_params"]["model"],
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"input_output",
|
|
||||||
chat_completion_test_cases["test_tool_calling"]["test_params"]["input_output"],
|
|
||||||
)
|
|
||||||
def test_chat_non_streaming_tool_calling(openai_client, input_output, correct_model_name):
|
|
||||||
response = openai_client.chat.completions.create(
|
|
||||||
model=correct_model_name,
|
|
||||||
messages=input_output["input"]["messages"],
|
|
||||||
tools=input_output["input"]["tools"],
|
|
||||||
stream=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert response.choices[0].message.role == "assistant"
|
|
||||||
assert len(response.choices[0].message.tool_calls) > 0
|
|
||||||
assert input_output["output"] == "get_weather_tool_call"
|
|
||||||
assert response.choices[0].message.tool_calls[0].function.name == "get_weather"
|
|
||||||
# TODO: add detailed type validation
|
|
||||||
|
|
||||||
|
|
||||||
def get_structured_output(maybe_json_content: str, schema_name: str) -> Any | None:
|
|
||||||
if schema_name == "valid_calendar_event":
|
|
||||||
|
|
||||||
class CalendarEvent(BaseModel):
|
|
||||||
name: str
|
|
||||||
date: str
|
|
||||||
participants: list[str]
|
|
||||||
|
|
||||||
try:
|
|
||||||
calendar_event = CalendarEvent.model_validate_json(maybe_json_content)
|
|
||||||
return calendar_event
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
elif schema_name == "valid_math_reasoning":
|
|
||||||
|
|
||||||
class Step(BaseModel):
|
|
||||||
explanation: str
|
|
||||||
output: str
|
|
||||||
|
|
||||||
class MathReasoning(BaseModel):
|
|
||||||
steps: list[Step]
|
|
||||||
final_answer: str
|
|
||||||
|
|
||||||
try:
|
|
||||||
math_reasoning = MathReasoning.model_validate_json(maybe_json_content)
|
|
||||||
return math_reasoning
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:
|
|
||||||
structured_output = get_structured_output(maybe_json_content, schema_name)
|
|
||||||
assert structured_output is not None
|
|
||||||
if schema_name == "valid_calendar_event":
|
|
||||||
assert structured_output.name is not None
|
|
||||||
assert structured_output.date is not None
|
|
||||||
assert len(structured_output.participants) == 2
|
|
||||||
elif schema_name == "valid_math_reasoning":
|
|
||||||
assert len(structured_output.final_answer) > 0
|
|
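For context, the `correct_model_name` fixture above only assumes that `providers_model_mapping` is a nested dict keyed first by provider and then by the generic model name. The sketch below is a hypothetical illustration of that shape; the provider key and model IDs are placeholders, not the project's actual mapping.

```python
# Hypothetical shape of the providers_model_mapping fixture consumed by
# correct_model_name: {provider: {generic model name: provider-specific model id}}.
providers_model_mapping = {
    "example-provider": {
        "Llama-3.3-70B-Instruct": "example-provider/llama-3.3-70b-instruct",
    },
}

# With model="Llama-3.3-70B-Instruct" and provider="example-provider", the fixture
# resolves to the provider-specific id; a model missing from the mapping is skipped.
resolved = providers_model_mapping["example-provider"]["Llama-3.3-70B-Instruct"]
print(resolved)
```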
tests/verifications/openai_api/fixtures/fixtures.py (new file, 105 lines):

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import os
from pathlib import Path

import pytest
import yaml
from openai import OpenAI


# --- Helper Function to Load Config ---
def _load_all_verification_configs():
    """Load and aggregate verification configs from the conf/ directory."""
    # Note: Path is relative to *this* file (fixtures.py)
    conf_dir = Path(__file__).parent.parent.parent / "conf"
    if not conf_dir.is_dir():
        # Use pytest.fail if called during test collection, otherwise raise error
        # For simplicity here, we'll raise an error, assuming direct calls
        # are less likely or can handle it.
        raise FileNotFoundError(f"Verification config directory not found at {conf_dir}")

    all_provider_configs = {}
    yaml_files = list(conf_dir.glob("*.yaml"))
    if not yaml_files:
        raise FileNotFoundError(f"No YAML configuration files found in {conf_dir}")

    for config_path in yaml_files:
        provider_name = config_path.stem
        try:
            with open(config_path, "r") as f:
                provider_config = yaml.safe_load(f)
                if provider_config:
                    all_provider_configs[provider_name] = provider_config
                else:
                    # Log warning if possible, or just skip empty files silently
                    print(f"Warning: Config file {config_path} is empty or invalid.")
        except Exception as e:
            raise IOError(f"Error loading config file {config_path}: {e}") from e

    return {"providers": all_provider_configs}


# --- End Helper Function ---


@pytest.fixture(scope="session")
def verification_config():
    """Pytest fixture to provide the loaded verification config."""
    try:
        return _load_all_verification_configs()
    except (FileNotFoundError, IOError) as e:
        pytest.fail(str(e))  # Fail test collection if config loading fails


@pytest.fixture
def provider(request, verification_config):
    provider = request.config.getoption("--provider")
    base_url = request.config.getoption("--base-url")

    if provider and base_url and verification_config["providers"][provider]["base_url"] != base_url:
        raise ValueError(f"Provider {provider} is not supported for base URL {base_url}")

    if not provider:
        if not base_url:
            raise ValueError("Provider and base URL are not provided")
        for provider, metadata in verification_config["providers"].items():
            if metadata["base_url"] == base_url:
                provider = provider
                break

    return provider


@pytest.fixture
def base_url(request, provider, verification_config):
    return request.config.getoption("--base-url") or verification_config["providers"][provider]["base_url"]


@pytest.fixture
def api_key(request, provider, verification_config):
    provider_conf = verification_config.get("providers", {}).get(provider, {})
    api_key_env_var = provider_conf.get("api_key_var")

    key_from_option = request.config.getoption("--api-key")
    key_from_env = os.getenv(api_key_env_var) if api_key_env_var else None

    final_key = key_from_option or key_from_env
    return final_key


@pytest.fixture
def model_mapping(provider, providers_model_mapping):
    return providers_model_mapping[provider]


@pytest.fixture
def openai_client(base_url, api_key):
    return OpenAI(
        base_url=base_url,
        api_key=api_key,
    )
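The fixtures above and the test module below only rely on a handful of keys per provider config: `base_url`, `api_key_var`, `models`, and `test_exclusions`. The snippet below is a minimal sketch of what a `conf/<provider>.yaml` file with those keys could look like; the provider name, URLs, model IDs, and environment variable are illustrative placeholders, not the repository's actual configs.

```python
import yaml

# Placeholder provider config; the key names match what fixtures.py and
# test_chat_completion.py read, but the values are made up for illustration.
example_provider_yaml = """
base_url: https://api.example-provider.com/v1
api_key_var: EXAMPLE_PROVIDER_API_KEY
models:
- example-provider/model-a
- example-provider/model-b
test_exclusions:
  example-provider/model-a:
  - test_chat_non_streaming_image
  - test_chat_streaming_image
"""

provider_config = yaml.safe_load(example_provider_yaml)
# _load_all_verification_configs() returns {"providers": {<file stem>: <parsed yaml>}},
# so this file would surface as {"providers": {"example-provider": provider_config}}.
print(provider_config["base_url"], provider_config["models"])
```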
chat_completion test-case YAML (modified): `input_output` is renamed to `case`, cases gain an explicit `case_id`, and the per-test `model` lists are removed (models now come from the provider configs).

@@ -1,31 +1,24 @@
 test_chat_basic:
   test_name: test_chat_basic
   test_params:
-    input_output:
-    - input:
+    case:
+    - case_id: "earth"
+      input:
         messages:
         - content: Which planet do humans live on?
           role: user
       output: Earth
-    - input:
+    - case_id: "saturn"
+      input:
         messages:
         - content: Which planet has rings around it with a name starting with letter
             S?
           role: user
       output: Saturn
-    model:
-    - Llama-3.3-8B-Instruct
-    - Llama-3.3-70B-Instruct
-    - Llama-4-Scout-17B-16E
-    - Llama-4-Scout-17B-16E-Instruct
-    - Llama-4-Maverick-17B-128E
-    - Llama-4-Maverick-17B-128E-Instruct
-    - gpt-4o
-    - gpt-4o-mini
 test_chat_image:
   test_name: test_chat_image
   test_params:
-    input_output:
+    case:
     - input:
         messages:
         - content:
@@ -36,18 +29,12 @@ test_chat_image:
             type: image_url
           role: user
       output: llama
-    model:
-    - Llama-4-Scout-17B-16E
-    - Llama-4-Scout-17B-16E-Instruct
-    - Llama-4-Maverick-17B-128E
-    - Llama-4-Maverick-17B-128E-Instruct
-    - gpt-4o
-    - gpt-4o-mini
 test_chat_structured_output:
   test_name: test_chat_structured_output
   test_params:
-    input_output:
-    - input:
+    case:
+    - case_id: "calendar"
+      input:
         messages:
         - content: Extract the event information.
           role: system
@@ -77,7 +64,8 @@ test_chat_structured_output:
             type: object
           type: json_schema
       output: valid_calendar_event
-    - input:
+    - case_id: "math"
+      input:
         messages:
         - content: You are a helpful math tutor. Guide the user through the solution
             step by step.
@@ -118,19 +106,10 @@ test_chat_structured_output:
             type: object
           type: json_schema
       output: valid_math_reasoning
-    model:
-    - Llama-3.3-8B-Instruct
-    - Llama-3.3-70B-Instruct
-    - Llama-4-Scout-17B-16E
-    - Llama-4-Scout-17B-16E-Instruct
-    - Llama-4-Maverick-17B-128E
-    - Llama-4-Maverick-17B-128E-Instruct
-    - gpt-4o
-    - gpt-4o-mini
 test_tool_calling:
   test_name: test_tool_calling
   test_params:
-    input_output:
+    case:
     - input:
         messages:
         - content: You are a helpful assistant that can use tools to get information.
@@ -152,11 +131,3 @@ test_tool_calling:
             type: object
           type: function
       output: get_weather_tool_call
-    model:
-    - Llama-3.3-70B-Instruct
-    - Llama-4-Scout-17B-16E
-    - Llama-4-Scout-17B-16E-Instruct
-    - Llama-4-Maverick-17B-128E
-    - Llama-4-Maverick-17B-128E-Instruct
-    - gpt-4o
-    - gpt-4o-mini
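With each case carrying an explicit `case_id`, the parametrized tests get readable pytest IDs such as `test_chat_non_streaming_basic[<model>-earth]`; the sanitization mirrors the `case_id_generator` helper in the new test module below. A small standalone sketch of that ID generation:

```python
import re


def case_id_generator(case):
    """Turn a test case's optional case_id into a pytest-friendly ID fragment."""
    case_id = case.get("case_id")
    if isinstance(case_id, (str, int)):
        # Replace non-word characters (and a leading digit) with underscores.
        return re.sub(r"\W|^(?=\d)", "_", str(case_id))
    return None


print(case_id_generator({"case_id": "earth"}))  # -> earth
print(case_id_generator({"input": {}}))         # -> None (pytest falls back to its default ID)
```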
tests/verifications/openai_api/test_chat_completion.py (new file, 271 lines):

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import re
from typing import Any

import pytest
from pydantic import BaseModel

from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
from tests.verifications.openai_api.fixtures.load import load_test_cases

chat_completion_test_cases = load_test_cases("chat_completion")


def case_id_generator(case):
    """Generate a test ID from the case's 'case_id' field, or use a default."""
    case_id = case.get("case_id")
    if isinstance(case_id, (str, int)):
        return re.sub(r"\W|^(?=\d)", "_", str(case_id))
    return None


def pytest_generate_tests(metafunc):
    """Dynamically parametrize tests based on the selected provider and config."""
    if "model" in metafunc.fixturenames:
        provider = metafunc.config.getoption("provider")
        if not provider:
            print("Warning: --provider not specified. Skipping model parametrization.")
            metafunc.parametrize("model", [])
            return

        try:
            config_data = _load_all_verification_configs()
        except (FileNotFoundError, IOError) as e:
            print(f"ERROR loading verification configs: {e}")
            config_data = {"providers": {}}

        provider_config = config_data.get("providers", {}).get(provider)
        if provider_config:
            models = provider_config.get("models", [])
            if models:
                metafunc.parametrize("model", models)
            else:
                print(f"Warning: No models found for provider '{provider}' in config.")
                metafunc.parametrize("model", [])  # Parametrize empty if no models found
        else:
            print(f"Warning: Provider '{provider}' not found in config. No models parametrized.")
            metafunc.parametrize("model", [])  # Parametrize empty if provider not found


def should_skip_test(verification_config, provider, model, test_name_base):
    """Check if a test should be skipped based on config exclusions."""
    provider_config = verification_config.get("providers", {}).get(provider)
    if not provider_config:
        return False  # No config for provider, don't skip

    exclusions = provider_config.get("test_exclusions", {}).get(model, [])
    return test_name_base in exclusions


# Helper to get the base test name from the request object
def get_base_test_name(request):
    return request.node.originalname


# --- Test Functions ---


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_chat_basic"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_non_streaming_basic(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    assert case["output"].lower() in response.choices[0].message.content.lower()


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_chat_basic"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        stream=True,
    )
    content = ""
    for chunk in response:
        content += chunk.choices[0].delta.content or ""

    # TODO: add detailed type validation

    assert case["output"].lower() in content.lower()


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_chat_image"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_non_streaming_image(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    assert case["output"].lower() in response.choices[0].message.content.lower()


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_chat_image"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        stream=True,
    )
    content = ""
    for chunk in response:
        content += chunk.choices[0].delta.content or ""

    # TODO: add detailed type validation

    assert case["output"].lower() in content.lower()


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_non_streaming_structured_output(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        response_format=case["input"]["response_format"],
        stream=False,
    )

    assert response.choices[0].message.role == "assistant"
    maybe_json_content = response.choices[0].message.content

    validate_structured_output(maybe_json_content, case["output"])


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_chat_structured_output"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        response_format=case["input"]["response_format"],
        stream=True,
    )
    maybe_json_content = ""
    for chunk in response:
        maybe_json_content += chunk.choices[0].delta.content or ""
    validate_structured_output(maybe_json_content, case["output"])


@pytest.mark.parametrize(
    "case",
    chat_completion_test_cases["test_tool_calling"]["test_params"]["case"],
    ids=case_id_generator,
)
def test_chat_non_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):
    test_name_base = get_base_test_name(request)
    if should_skip_test(verification_config, provider, model, test_name_base):
        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")

    response = openai_client.chat.completions.create(
        model=model,
        messages=case["input"]["messages"],
        tools=case["input"]["tools"],
        stream=False,
    )

    assert response.choices[0].message.role == "assistant"
    assert len(response.choices[0].message.tool_calls) > 0
    assert case["output"] == "get_weather_tool_call"
    assert response.choices[0].message.tool_calls[0].function.name == "get_weather"
    # TODO: add detailed type validation


# --- Helper functions (structured output validation) ---


def get_structured_output(maybe_json_content: str, schema_name: str) -> Any | None:
    if schema_name == "valid_calendar_event":

        class CalendarEvent(BaseModel):
            name: str
            date: str
            participants: list[str]

        try:
            calendar_event = CalendarEvent.model_validate_json(maybe_json_content)
            return calendar_event
        except Exception:
            return None
    elif schema_name == "valid_math_reasoning":

        class Step(BaseModel):
            explanation: str
            output: str

        class MathReasoning(BaseModel):
            steps: list[Step]
            final_answer: str

        try:
            math_reasoning = MathReasoning.model_validate_json(maybe_json_content)
            return math_reasoning
        except Exception:
            return None

    return None


def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:
    structured_output = get_structured_output(maybe_json_content, schema_name)
    assert structured_output is not None
    if schema_name == "valid_calendar_event":
        assert structured_output.name is not None
        assert structured_output.date is not None
        assert len(structured_output.participants) == 2
    elif schema_name == "valid_math_reasoning":
        assert len(structured_output.final_answer) > 0
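The fixtures and `pytest_generate_tests` above read `--provider`, `--base-url`, and `--api-key` through `request.config.getoption` / `metafunc.config.getoption`. The conftest that registers those options is not shown in this excerpt, so the following is only a plausible sketch of what such a `pytest_addoption` hook could look like, not the repository's actual conftest.py:

```python
# Hypothetical conftest.py sketch: registers the CLI options that the
# verification fixtures and pytest_generate_tests query by name.
def pytest_addoption(parser):
    parser.addoption("--provider", action="store", default=None,
                     help="Provider key matching a YAML file under the verifications conf/ directory")
    parser.addoption("--base-url", action="store", default=None,
                     help="Select or override the provider by its OpenAI-compatible base URL")
    parser.addoption("--api-key", action="store", default=None,
                     help="API key; otherwise the api_key_var environment variable from the config is used")
```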
tests/verifications/test_results/fireworks_1744264202.json (new file, 1329 lines): diff suppressed because it is too large.

tests/verifications/test_results/openai_1744264304.json (new file, 868 lines):
{
    "created": 1744264338.9923031,
    "duration": 32.825536012649536,
    "exitcode": 0,
    "root": "/Users/erichuang/projects/llama-stack",
    "environment": {},
    "summary": {
        "passed": 22,
        "total": 22,
        "collected": 22
    },
    "collectors": [ ... ],
    "tests": [ ... ]
}

The "collectors" and "tests" arrays list the 22 parametrized cases collected from tests/verifications/openai_api/test_chat_completion.py: non-streaming and streaming variants of the basic (earth, saturn), image (case0), and structured output (calendar, math) tests, plus non-streaming tool calling (case0), each run against gpt-4o and gpt-4o-mini. Every entry records the model and case_id in its metadata and passed setup, call, and teardown phases with per-phase durations.
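Because each per-test entry in these result files carries `metadata.model`, `metadata.case_id`, and an `outcome`, summarizing a run per model is a simple aggregation. The sketch below is illustrative and not part of this change; it only assumes the JSON layout shown above.

```python
import json
from collections import Counter

# Tally outcomes per model from one of the result files added in this change.
with open("tests/verifications/test_results/openai_1744264304.json") as f:
    report = json.load(f)

per_model = Counter()
for test in report.get("tests", []):
    model = test.get("metadata", {}).get("model", "unknown")
    per_model[(model, test.get("outcome", "unknown"))] += 1

for (model, outcome), count in sorted(per_model.items()):
    print(f"{model}: {count} {outcome}")
```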
tests/verifications/test_results/together_1744264258.json (new file, 1420 lines): diff suppressed because it is too large.