mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-23 04:53:14 +00:00
test(verification): more tests, multiturn tool use tests (#1954)
# What does this PR do?
## Test Plan
(myenv) ➜ llama-stack python tests/verifications/generate_report.py
--providers fireworks,together,openai --run-tests
f27f617629/tests/verifications/REPORT.md
This commit is contained in:
parent
86c6f1f112
commit
32e3da7392
6 changed files with 6274 additions and 599 deletions
|
@ -1,6 +1,6 @@
|
|||
# Test Results Report
|
||||
|
||||
*Generated on: 2025-04-10 16:48:18*
|
||||
*Generated on: 2025-04-14 18:11:37*
|
||||
|
||||
*This report was generated by running `python tests/verifications/generate_report.py`*
|
||||
|
||||
|
@ -15,15 +15,15 @@
|
|||
|
||||
| Provider | Pass Rate | Tests Passed | Total Tests |
|
||||
| --- | --- | --- | --- |
|
||||
| Together | 64.7% | 22 | 34 |
|
||||
| Fireworks | 82.4% | 28 | 34 |
|
||||
| Openai | 100.0% | 24 | 24 |
|
||||
| Together | 48.7% | 37 | 76 |
|
||||
| Fireworks | 47.4% | 36 | 76 |
|
||||
| Openai | 100.0% | 52 | 52 |
|
||||
|
||||
|
||||
|
||||
## Together
|
||||
|
||||
*Tests run on: 2025-04-10 16:46:35*
|
||||
*Tests run on: 2025-04-14 18:08:14*
|
||||
|
||||
```bash
|
||||
# Run all tests for this provider:
|
||||
|
@ -48,19 +48,33 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=togethe
|
|||
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ❌ | ✅ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_calling | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_choice_none | ❌ | ❌ | ❌ |
|
||||
| test_chat_non_streaming_tool_choice_required | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_basic (earth) | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_basic (saturn) | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_image | ⚪ | ❌ | ❌ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ❌ | ❌ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ❌ | ❌ | ❌ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ❌ | ❌ | ❌ |
|
||||
| test_chat_streaming_structured_output (calendar) | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_structured_output (math) | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_tool_calling | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_tool_choice_none | ❌ | ❌ | ❌ |
|
||||
| test_chat_streaming_tool_choice_required | ✅ | ❌ | ❌ |
|
||||
|
||||
## Fireworks
|
||||
|
||||
*Tests run on: 2025-04-10 16:44:44*
|
||||
*Tests run on: 2025-04-14 18:04:06*
|
||||
|
||||
```bash
|
||||
# Run all tests for this provider:
|
||||
|
@ -85,19 +99,33 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=firewor
|
|||
| test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ❌ | ❌ | ❌ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ❌ | ❌ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ❌ | ❌ | ❌ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ❌ | ❌ | ❌ |
|
||||
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_calling | ❌ | ❌ | ❌ |
|
||||
| test_chat_non_streaming_tool_choice_none | ✅ | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_choice_required | ✅ | ❌ | ❌ |
|
||||
| test_chat_streaming_basic (earth) | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_basic (saturn) | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_image | ⚪ | ✅ | ✅ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ❌ | ❌ | ❌ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ❌ | ❌ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ❌ | ❌ | ❌ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ❌ | ❌ | ❌ |
|
||||
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_structured_output (math) | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_tool_calling | ❌ | ❌ | ❌ |
|
||||
| test_chat_streaming_tool_choice_none | ✅ | ✅ | ✅ |
|
||||
| test_chat_streaming_tool_choice_required | ✅ | ❌ | ❌ |
|
||||
|
||||
## Openai
|
||||
|
||||
*Tests run on: 2025-04-10 16:47:28*
|
||||
*Tests run on: 2025-04-14 18:09:51*
|
||||
|
||||
```bash
|
||||
# Run all tests for this provider:
|
||||
|
@ -121,12 +149,26 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai
|
|||
| test_chat_non_streaming_basic (earth) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_image | ✅ | ✅ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_structured_output (math) | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_calling | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_choice_none | ✅ | ✅ |
|
||||
| test_chat_non_streaming_tool_choice_required | ✅ | ✅ |
|
||||
| test_chat_streaming_basic (earth) | ✅ | ✅ |
|
||||
| test_chat_streaming_basic (saturn) | ✅ | ✅ |
|
||||
| test_chat_streaming_image | ✅ | ✅ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ | ✅ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ✅ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ✅ | ✅ |
|
||||
| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ |
|
||||
| test_chat_streaming_structured_output (calendar) | ✅ | ✅ |
|
||||
| test_chat_streaming_structured_output (math) | ✅ | ✅ |
|
||||
| test_chat_streaming_tool_calling | ✅ | ✅ |
|
||||
| test_chat_streaming_tool_choice_none | ✅ | ✅ |
|
||||
| test_chat_streaming_tool_choice_required | ✅ | ✅ |
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue