mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-24 05:14:30 +00:00
feat(verification): various improvements (#1921)
# What does this PR do?

- providers and their models now live in config.yaml
- better distinguish different cases within a test
- add model key to surface provider's model_id
- include example command to rerun single test case

## Test Plan

<img width="1173" alt="image" src="https://github.com/user-attachments/assets/b414baf0-c768-451f-8c3b-c2905cf36fac" />
This commit is contained in:
parent
09a83b1ec1
commit
14146e4b3f
22 changed files with 4449 additions and 8810 deletions
|
@ -1,6 +1,6 @@
|
|||
# Test Results Report
|
||||
|
||||
*Generated on: 2025-04-08 21:14:02*
|
||||
*Generated on: 2025-04-09 22:52:19*
|
||||
|
||||
*This report was generated by running `python tests/verifications/generate_report.py`*
|
||||
|
||||
|
@ -23,66 +23,107 @@
|
|||
|
||||
## Together
|
||||
|
||||
*Tests run on: 2025-04-08 16:19:59*
|
||||
*Tests run on: 2025-04-09 22:50:58*
|
||||
|
||||
```bash
|
||||
pytest tests/verifications/openai/test_chat_completion.py --provider=together -v
|
||||
# Run all tests for this provider:
|
||||
pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -v
|
||||
|
||||
# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
|
||||
pytest tests/verifications/openai_api/test_chat_completion.py --provider=together -k "test_chat_non_streaming_basic and earth"
|
||||
```
|
||||
|
||||
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct |
|
||||
|
||||
**Model Key (Together)**
|
||||
|
||||
| Display Name | Full Model ID |
|
||||
| --- | --- |
|
||||
| Llama-3.3-70B-Instruct | `meta-llama/Llama-3.3-70B-Instruct-Turbo` |
|
||||
| Llama-4-Maverick-Instruct | `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8` |
|
||||
| Llama-4-Scout-Instruct | `meta-llama/Llama-4-Scout-17B-16E-Instruct` |
|
||||
|
||||
|
||||
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
|
||||
| --- | --- | --- | --- |
|
||||
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_basic (case 0) | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_basic (case 1) | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_image (case 0) | ⚪ | ❌ | ❌ |
|
||||
| test_chat_streaming_structured_output (case 0) | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_structured_output (case 1) | ✅ | ❌ | ❌ |
|
||||
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_calling | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_basic (earth) | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_basic (saturn) | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_image | ⚪ | ❌ | ❌ |
|
||||
| test_chat_streaming_structured_output (calendar) | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_structured_output (math) | ✅ | ❌ | ❌ |
|
||||
|
||||
## Fireworks
|
||||
|
||||
*Tests run on: 2025-04-08 16:18:28*
|
||||
*Tests run on: 2025-04-09 22:50:02*
|
||||
|
||||
```bash
|
||||
pytest tests/verifications/openai/test_chat_completion.py --provider=fireworks -v
|
||||
# Run all tests for this provider:
|
||||
pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -v
|
||||
|
||||
# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
|
||||
pytest tests/verifications/openai_api/test_chat_completion.py --provider=fireworks -k "test_chat_non_streaming_basic and earth"
|
||||
```
|
||||
|
||||
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct |
|
||||
|
||||
**Model Key (Fireworks)**
|
||||
|
||||
| Display Name | Full Model ID |
|
||||
| --- | --- |
|
||||
| Llama-3.3-70B-Instruct | `accounts/fireworks/models/llama-v3p3-70b-instruct` |
|
||||
| Llama-4-Maverick-Instruct | `accounts/fireworks/models/llama4-maverick-instruct-basic` |
|
||||
| Llama-4-Scout-Instruct | `accounts/fireworks/models/llama4-scout-instruct-basic` |
|
||||
|
||||
|
||||
| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-Instruct | Llama-4-Scout-Instruct |
|
||||
| --- | --- | --- | --- |
|
||||
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_basic (case 0) | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_basic (case 1) | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_image (case 0) | ⚪ | ✅ | ✅ |
|
||||
| test_chat_streaming_structured_output (case 0) | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_structured_output (case 1) | ❌ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_calling | ❌ | ❌ | ❌ |
|
||||
| test_chat_streaming_basic (earth) | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_basic (saturn) | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_image | ⚪ | ✅ | ✅ |
|
||||
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_structured_output (math) | ✅ | ✅ | ✅ |
|
||||
|
||||
## Openai
|
||||
|
||||
*Tests run on: 2025-04-08 16:22:02*
|
||||
*Tests run on: 2025-04-09 22:51:44*
|
||||
|
||||
```bash
|
||||
pytest tests/verifications/openai/test_chat_completion.py --provider=openai -v
|
||||
# Run all tests for this provider:
|
||||
pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -v
|
||||
|
||||
# Example: Run only the 'earth' case of test_chat_non_streaming_basic:
|
||||
pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai -k "test_chat_non_streaming_basic and earth"
|
||||
```
|
||||
|
||||
|
||||
**Model Key (Openai)**
|
||||
|
||||
| Display Name | Full Model ID |
|
||||
| --- | --- |
|
||||
| gpt-4o | `gpt-4o` |
|
||||
| gpt-4o-mini | `gpt-4o-mini` |
|
||||
|
||||
|
||||
| Test | gpt-4o | gpt-4o-mini |
|
||||
| --- | --- | --- |
|
||||
| test_chat_non_streaming_basic (case 0) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_basic (case 1) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_image (case 0) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ |
|
||||
| test_chat_streaming_basic (case 0) | ✅ | ✅ |
|
||||
| test_chat_streaming_basic (case 1) | ✅ | ✅ |
|
||||
| test_chat_streaming_image (case 0) | ✅ | ✅ |
|
||||
| test_chat_streaming_structured_output (case 0) | ✅ | ✅ |
|
||||
| test_chat_streaming_structured_output (case 1) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_basic (earth) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_image | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_calling | ✅ | ✅ |
|
||||
| test_chat_streaming_basic (earth) | ✅ | ✅ |
|
||||
| test_chat_streaming_basic (saturn) | ✅ | ✅ |
|
||||
| test_chat_streaming_image | ✅ | ✅ |
|
||||
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ |
|
||||
| test_chat_streaming_structured_output (math) | ✅ | ✅ |
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue