mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-15 22:47:59 +00:00
# What does this PR do?
## Test Plan
(myenv) ➜ llama-stack python tests/verifications/generate_report.py
--providers fireworks,together,openai --run-tests
f27f617629/tests/verifications/REPORT.md
3578 lines
246 KiB
JSON
3578 lines
246 KiB
JSON
{
|
|
"created": 1744679387.346831,
|
|
"duration": 90.31976795196533,
|
|
"exitcode": 1,
|
|
"root": "/Users/erichuang/projects/llama-stack",
|
|
"environment": {},
|
|
"summary": {
|
|
"passed": 37,
|
|
"failed": 39,
|
|
"skipped": 2,
|
|
"total": 78,
|
|
"collected": 78
|
|
},
|
|
"collectors": [
|
|
{
|
|
"nodeid": "",
|
|
"outcome": "passed",
|
|
"result": [
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"type": "Module"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"outcome": "passed",
|
|
"result": [
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]",
|
|
"type": "Function",
|
|
"lineno": 74
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]",
|
|
"type": "Function",
|
|
"lineno": 74
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]",
|
|
"type": "Function",
|
|
"lineno": 74
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]",
|
|
"type": "Function",
|
|
"lineno": 74
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]",
|
|
"type": "Function",
|
|
"lineno": 74
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]",
|
|
"type": "Function",
|
|
"lineno": 74
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]",
|
|
"type": "Function",
|
|
"lineno": 93
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]",
|
|
"type": "Function",
|
|
"lineno": 93
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]",
|
|
"type": "Function",
|
|
"lineno": 93
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]",
|
|
"type": "Function",
|
|
"lineno": 93
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]",
|
|
"type": "Function",
|
|
"lineno": 93
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]",
|
|
"type": "Function",
|
|
"lineno": 93
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 117
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 117
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 117
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 136
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 136
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 136
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]",
|
|
"type": "Function",
|
|
"lineno": 160
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]",
|
|
"type": "Function",
|
|
"lineno": 160
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]",
|
|
"type": "Function",
|
|
"lineno": 160
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]",
|
|
"type": "Function",
|
|
"lineno": 160
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]",
|
|
"type": "Function",
|
|
"lineno": 160
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]",
|
|
"type": "Function",
|
|
"lineno": 160
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]",
|
|
"type": "Function",
|
|
"lineno": 183
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]",
|
|
"type": "Function",
|
|
"lineno": 183
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]",
|
|
"type": "Function",
|
|
"lineno": 183
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]",
|
|
"type": "Function",
|
|
"lineno": 183
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]",
|
|
"type": "Function",
|
|
"lineno": 183
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]",
|
|
"type": "Function",
|
|
"lineno": 183
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 205
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 205
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 205
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 229
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 229
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 229
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 257
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 257
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 257
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 281
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 281
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 281
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 308
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 308
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 308
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 331
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 331
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 331
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]",
|
|
"type": "Function",
|
|
"lineno": 359
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]",
|
|
"type": "Function",
|
|
"lineno": 450
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"tests": [
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]",
|
|
"lineno": 74,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-earth",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "earth"
|
|
},
|
|
"setup": {
|
|
"duration": 0.1559112500399351,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.3692209171131253,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00021362490952014923,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]",
|
|
"lineno": 74,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "saturn"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007326166843995452,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.49173945817165077,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00034487503580749035,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]",
|
|
"lineno": 74,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-earth",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "earth"
|
|
},
|
|
"setup": {
|
|
"duration": 0.021014458034187555,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.36956487502902746,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0007119579240679741,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]",
|
|
"lineno": 74,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "saturn"
|
|
},
|
|
"setup": {
|
|
"duration": 0.011922625126317143,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.7763332079630345,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0004842919297516346,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]",
|
|
"lineno": 74,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "earth"
|
|
},
|
|
"setup": {
|
|
"duration": 0.023896750062704086,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.9817597079090774,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0004768748767673969,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]",
|
|
"lineno": 74,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "saturn"
|
|
},
|
|
"setup": {
|
|
"duration": 0.07423937506973743,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.3721332079730928,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00020033284090459347,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]",
|
|
"lineno": 93,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-earth",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "earth"
|
|
},
|
|
"setup": {
|
|
"duration": 0.010166750056669116,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.41266337502747774,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00034358282573521137,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]",
|
|
"lineno": 93,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "saturn"
|
|
},
|
|
"setup": {
|
|
"duration": 0.016687541967257857,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.7235856249462813,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00027179205790162086,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]",
|
|
"lineno": 93,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-earth",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "earth"
|
|
},
|
|
"setup": {
|
|
"duration": 0.012556416913866997,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.27039612480439246,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]>>\nopenai_client = <openai.OpenAI object at 0x10e10d930>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'earth', 'input': {'messages': [{'content': 'Which planet do humans live on?', 'role': 'user'}]}, 'output': 'Earth'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:111: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002312080468982458,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]",
|
|
"lineno": 93,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "saturn"
|
|
},
|
|
"setup": {
|
|
"duration": 0.006413874914869666,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.36463545891456306,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]>>\nopenai_client = <openai.OpenAI object at 0x107b73a00>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'saturn', 'input': {'messages': [{'content': 'Which planet has rings around it with a name starting with letter S?', 'role': 'user'}]}, 'output': 'Saturn'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:111: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00023154192604124546,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]",
|
|
"lineno": 93,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "earth"
|
|
},
|
|
"setup": {
|
|
"duration": 0.015633082948625088,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.8896284159272909,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]>>\nopenai_client = <openai.OpenAI object at 0x107bd7400>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'earth', 'input': {'messages': [{'content': 'Which planet do humans live on?', 'role': 'user'}]}, 'output': 'Earth'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:111: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0006587498355656862,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]",
|
|
"lineno": 93,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "saturn"
|
|
},
|
|
"setup": {
|
|
"duration": 0.012669583084061742,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.3499396659899503,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]>>\nopenai_client = <openai.OpenAI object at 0x107b736d0>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'saturn', 'input': {'messages': [{'content': 'Which planet has rings around it with a name starting with letter S?', 'role': 'user'}]}, 'output': 'Saturn'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:111: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00024912506341934204,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"lineno": 117,
|
|
"outcome": "skipped",
|
|
"keywords": [
|
|
"test_chat_non_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.0153201250359416,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.0001901669893413782,
|
|
"outcome": "skipped",
|
|
"longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py', 126, 'Skipped: Skipping test_chat_non_streaming_image for model meta-llama/Llama-3.3-70B-Instruct-Turbo on provider together based on config.')"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00012779212556779385,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"lineno": 117,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.008855124935507774,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.37906050006859,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0004904591478407383,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"lineno": 117,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.017166708130389452,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 4.003400916932151,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00042724981904029846,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"lineno": 136,
|
|
"outcome": "skipped",
|
|
"keywords": [
|
|
"test_chat_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007232750067487359,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.0001449580304324627,
|
|
"outcome": "skipped",
|
|
"longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py', 145, 'Skipped: Skipping test_chat_streaming_image for model meta-llama/Llama-3.3-70B-Instruct-Turbo on provider together based on config.')"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0001349160447716713,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"lineno": 136,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007052165921777487,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.4663615000899881,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 154,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 154,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]>>\nopenai_client = <openai.OpenAI object at 0x10e1f9930>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': [{'text': 'What is in this image?', 'type': 'text'}, {'image_url': {...}, 'type': 'image_url'}], 'role': 'user'}]}, 'output': 'llama'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_image\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:154: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0005696250591427088,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"lineno": 136,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.01214433298446238,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 3.902559082955122,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 154,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 154,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]>>\nopenai_client = <openai.OpenAI object at 0x10e1dd840>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': [{'text': 'What is in this image?', 'type': 'text'}, {'image_url': {...}, 'type': 'image_url'}], 'role': 'user'}]}, 'output': 'llama'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_image\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:154: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.000591374933719635,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]",
|
|
"lineno": 160,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "calendar"
|
|
},
|
|
"setup": {
|
|
"duration": 0.01478054211474955,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.569845792138949,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00038724998012185097,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]",
|
|
"lineno": 160,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-math",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "math"
|
|
},
|
|
"setup": {
|
|
"duration": 0.014717916958034039,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.1819656670559198,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002410421147942543,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]",
|
|
"lineno": 160,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "calendar"
|
|
},
|
|
"setup": {
|
|
"duration": 0.006486707832664251,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5623017910402268,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00032504182308912277,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]",
|
|
"lineno": 160,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-math",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "math"
|
|
},
|
|
"setup": {
|
|
"duration": 0.009171125013381243,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.6005691669415683,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00023995805531740189,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]",
|
|
"lineno": 160,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "calendar"
|
|
},
|
|
"setup": {
|
|
"duration": 0.009700333932414651,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.4192442081402987,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00040241610258817673,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]",
|
|
"lineno": 160,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "math"
|
|
},
|
|
"setup": {
|
|
"duration": 0.006938542006537318,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.1736337919719517,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00019279099069535732,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]",
|
|
"lineno": 183,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "calendar"
|
|
},
|
|
"setup": {
|
|
"duration": 0.008775749942287803,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5588400410488248,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00040091690607368946,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]",
|
|
"lineno": 183,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-math",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "math"
|
|
},
|
|
"setup": {
|
|
"duration": 0.01844154205173254,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.205772665794939,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00021091708913445473,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]",
|
|
"lineno": 183,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "calendar"
|
|
},
|
|
"setup": {
|
|
"duration": 0.015595750184729695,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.6904467919375747,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 202,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 202,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]>>\nopenai_client = <openai.OpenAI object at 0x10e1f5db0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'calendar', 'input': {'messages': [{'content': 'Extract the event information.', 'role': 'system'}, {'cont...articipants'], 'title': 'CalendarEvent', 'type': 'object'}}, 'type': 'json_schema'}}, 'output': 'valid_calendar_event'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:202: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002907498273998499,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]",
|
|
"lineno": 183,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-math",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "math"
|
|
},
|
|
"setup": {
|
|
"duration": 0.008272957988083363,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 3.499622541014105,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 202,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 202,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]>>\nopenai_client = <openai.OpenAI object at 0x10e13baf0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'math', 'input': {'messages': [{'content': 'You are a helpful math tutor. Guide the user through the solut... ['steps', 'final_answer'], 'title': 'MathReasoning', ...}}, 'type': 'json_schema'}}, 'output': 'valid_math_reasoning'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:202: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0005947079043835402,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]",
|
|
"lineno": 183,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "calendar"
|
|
},
|
|
"setup": {
|
|
"duration": 0.013340875040739775,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.42789591709151864,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 202,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 202,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]>>\nopenai_client = <openai.OpenAI object at 0x10e1c41f0>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'calendar', 'input': {'messages': [{'content': 'Extract the event information.', 'role': 'system'}, {'cont...articipants'], 'title': 'CalendarEvent', 'type': 'object'}}, 'type': 'json_schema'}}, 'output': 'valid_calendar_event'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:202: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0003039578441530466,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]",
|
|
"lineno": 183,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "math"
|
|
},
|
|
"setup": {
|
|
"duration": 0.01058275019749999,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 5.795635707909241,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 202,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 202,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]>>\nopenai_client = <openai.OpenAI object at 0x10e1f4f10>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'math', 'input': {'messages': [{'content': 'You are a helpful math tutor. Guide the user through the solut... ['steps', 'final_answer'], 'title': 'MathReasoning', ...}}, 'type': 'json_schema'}}, 'output': 'valid_math_reasoning'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:202: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0005178749561309814,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"lineno": 205,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.014336749911308289,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.451304541900754,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0004718329291790724,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"lineno": 205,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.01625004201196134,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5111537908669561,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00046774977818131447,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"lineno": 205,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.015832332894206047,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.8238586660008878,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0006185418460518122,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"lineno": 229,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007832166040316224,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.685583250131458,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0004414590075612068,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"lineno": 229,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.021764083998277783,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.35617320891469717,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 247,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]>>\nopenai_client = <openai.OpenAI object at 0x10e482140>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n> _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:247: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10e48fd30>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0005425831768661737,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"lineno": 229,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.016708041075617075,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.49443637509830296,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 247,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]>>\nopenai_client = <openai.OpenAI object at 0x10e1026e0>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n> _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:247: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10e45f640>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002642078325152397,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"lineno": 257,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.009570583933964372,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5232214999850839,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0006591668352484703,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"lineno": 257,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.01567283389158547,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.4465816249139607,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0003922500181943178,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"lineno": 257,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.021711332956328988,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5361095829866827,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0003099590539932251,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"lineno": 281,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.009334125090390444,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5789772500284016,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00037712487392127514,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"lineno": 281,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.019614499993622303,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.444399792002514,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 300,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]>>\nopenai_client = <openai.OpenAI object at 0x10e491a50>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"required\", # Force tool call\n stream=True,\n )\n \n> _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:300: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10e100280>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0004192921333014965,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"lineno": 281,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.012822834076359868,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.6777042911853641,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 300,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]>>\nopenai_client = <openai.OpenAI object at 0x10e4ef460>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"required\", # Force tool call\n stream=True,\n )\n \n> _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:300: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10e45c7f0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0004483328666538,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"lineno": 308,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.011924332939088345,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.4756374170538038,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 328,
|
|
"message": "AssertionError: Expected no tool calls when tool_choice='none'\nassert [ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)] is None\n + where [ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]).tool_calls\n + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]), seed=13421903014786785000).message"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 328,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]>>\nopenai_client = <openai.OpenAI object at 0x10e48e6b0>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert response.choices[0].message.tool_calls is None, \"Expected no tool calls when tool_choice='none'\"\nE AssertionError: Expected no tool calls when tool_choice='none'\nE assert [ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)] is None\nE + where [ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]).tool_calls\nE + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]), seed=13421903014786785000).message\n\ntests/verifications/openai_api/test_chat_completion.py:328: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0004585420247167349,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"lineno": 308,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.013246082933619618,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5618870409671217,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 328,
|
|
"message": "AssertionError: Expected no tool calls when tool_choice='none'\nassert [ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)] is None\n + where [ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)]).tool_calls\n + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)]), seed=None).message"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 328,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]>>\nopenai_client = <openai.OpenAI object at 0x10e4ef2e0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert response.choices[0].message.tool_calls is None, \"Expected no tool calls when tool_choice='none'\"\nE AssertionError: Expected no tool calls when tool_choice='none'\nE assert [ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)] is None\nE + where [ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)]).tool_calls\nE + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)]), seed=None).message\n\ntests/verifications/openai_api/test_chat_completion.py:328: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00025883293710649014,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"lineno": 308,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.008055417099967599,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.32869591703638434,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 328,
|
|
"message": "AssertionError: Expected no tool calls when tool_choice='none'\nassert [ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] is None\n + where [ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]).tool_calls\n + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]), seed=None).message"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 328,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]>>\nopenai_client = <openai.OpenAI object at 0x10e4fa620>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert response.choices[0].message.tool_calls is None, \"Expected no tool calls when tool_choice='none'\"\nE AssertionError: Expected no tool calls when tool_choice='none'\nE assert [ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] is None\nE + where [ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]).tool_calls\nE + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]), seed=None).message\n\ntests/verifications/openai_api/test_chat_completion.py:328: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0003937501460313797,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"lineno": 331,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.013460749993100762,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.35879983310587704,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 355,
|
|
"message": "AssertionError: Expected no tool call chunks when tool_choice='none'\nassert not [ChoiceDeltaToolCall(index=0, id='call_q472clmnii99ps1fxqtv8qvr', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\n + where [ChoiceDeltaToolCall(index=0, id='call_q472clmnii99ps1fxqtv8qvr', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_q472clmnii99ps1fxqtv8qvr', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 355,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]>>\nopenai_client = <openai.OpenAI object at 0x10e102050>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=True,\n )\n \n content = \"\"\n for chunk in stream:\n delta = chunk.choices[0].delta\n if delta.content:\n content += delta.content\n> assert not delta.tool_calls, \"Expected no tool call chunks when tool_choice='none'\"\nE AssertionError: Expected no tool call chunks when tool_choice='none'\nE assert not [ChoiceDeltaToolCall(index=0, id='call_q472clmnii99ps1fxqtv8qvr', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\nE + where [ChoiceDeltaToolCall(index=0, id='call_q472clmnii99ps1fxqtv8qvr', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_q472clmnii99ps1fxqtv8qvr', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:355: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002649170346558094,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"lineno": 331,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.0068365419283509254,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5351063329726458,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 355,
|
|
"message": "AssertionError: Expected no tool call chunks when tool_choice='none'\nassert not [ChoiceDeltaToolCall(index=0, id='call_l3roc57o2pn9b70f0dcgil53', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\n + where [ChoiceDeltaToolCall(index=0, id='call_l3roc57o2pn9b70f0dcgil53', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_l3roc57o2pn9b70f0dcgil53', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 355,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]>>\nopenai_client = <openai.OpenAI object at 0x10e4fb3d0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=True,\n )\n \n content = \"\"\n for chunk in stream:\n delta = chunk.choices[0].delta\n if delta.content:\n content += delta.content\n> assert not delta.tool_calls, \"Expected no tool call chunks when tool_choice='none'\"\nE AssertionError: Expected no tool call chunks when tool_choice='none'\nE assert not [ChoiceDeltaToolCall(index=0, id='call_l3roc57o2pn9b70f0dcgil53', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\nE + where [ChoiceDeltaToolCall(index=0, id='call_l3roc57o2pn9b70f0dcgil53', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_l3roc57o2pn9b70f0dcgil53', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:355: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0004712918307632208,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"lineno": 331,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.014073874801397324,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.6729549579322338,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 355,
|
|
"message": "AssertionError: Expected no tool call chunks when tool_choice='none'\nassert not [ChoiceDeltaToolCall(index=0, id='call_ktw831i0p838mzvnnaylf6fp', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\n + where [ChoiceDeltaToolCall(index=0, id='call_ktw831i0p838mzvnnaylf6fp', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_ktw831i0p838mzvnnaylf6fp', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 355,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]>>\nopenai_client = <openai.OpenAI object at 0x10e46e8f0>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=True,\n )\n \n content = \"\"\n for chunk in stream:\n delta = chunk.choices[0].delta\n if delta.content:\n content += delta.content\n> assert not delta.tool_calls, \"Expected no tool call chunks when tool_choice='none'\"\nE AssertionError: Expected no tool call chunks when tool_choice='none'\nE assert not [ChoiceDeltaToolCall(index=0, id='call_ktw831i0p838mzvnnaylf6fp', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\nE + where [ChoiceDeltaToolCall(index=0, id='call_ktw831i0p838mzvnnaylf6fp', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_ktw831i0p838mzvnnaylf6fp', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:355: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.000251916004344821,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]",
|
|
"lineno": 359,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "text_then_weather_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.009340125136077404,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.3328715830575675,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 418,
|
|
"message": "AssertionError: Expected 0 tool calls, but got 1\nassert 1 == 0\n + where 1 = len(([ChatCompletionMessageToolCall(id='call_3rr948zuvun0533y4oyyep0z', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)]))\n + where [ChatCompletionMessageToolCall(id='call_3rr948zuvun0533y4oyyep0z', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_3rr948zuvun0533y4oyyep0z', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)]).tool_calls"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 418,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]>>\nopenai_client = <openai.OpenAI object at 0x10e41a530>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 0 tool calls, but got 1\nE assert 1 == 0\nE + where 1 = len(([ChatCompletionMessageToolCall(id='call_3rr948zuvun0533y4oyyep0z', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)]))\nE + where [ChatCompletionMessageToolCall(id='call_3rr948zuvun0533y4oyyep0z', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_3rr948zuvun0533y4oyyep0z', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00042020808905363083,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]",
|
|
"lineno": 359,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "weather_tool_then_text"
|
|
},
|
|
"setup": {
|
|
"duration": 0.01490145898424089,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.8346118750050664,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00034404080361127853,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]",
|
|
"lineno": 359,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "add_product_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.014493625145405531,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.8973606249783188,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00021345820277929306,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]",
|
|
"lineno": 359,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "get_then_create_event_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.009358166949823499,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 4.5295154170598835,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002461671829223633,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]",
|
|
"lineno": 359,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "compare_monthly_expense_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.009552374947816133,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.34176899981684983,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 429,
|
|
"message": "AssertionError: Expected arguments '{'month': 1, 'year': 2025}', got '{'month': '1', 'year': '2025'}'\nassert {'month': '1', 'year': '2025'} == {'month': 1, 'year': 2025}\n \n Differing items:\n {'month': '1'} != {'month': 1}\n {'year': '2025'} != {'year': 2025}\n \n Full diff:\n {...\n \n ...Full output truncated (7 lines hidden), use '-vv' to show"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 429,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]>>\nopenai_client = <openai.OpenAI object at 0x107bd7820>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n tool_call = assistant_message.tool_calls[0]\n assert tool_call.function.name == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call.function.name}'\"\n )\n # Parse the JSON string arguments before comparing\n actual_arguments = json.loads(tool_call.function.arguments)\n> assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\nE AssertionError: Expected arguments '{'month': 1, 'year': 2025}', got '{'month': '1', 'year': '2025'}'\nE assert {'month': '1', 'year': '2025'} == {'month': 1, 'year': 2025}\nE \nE Differing items:\nE {'month': '1'} != {'month': 1}\nE {'year': '2025'} != {'year': 2025}\nE \nE Full diff:\nE {...\nE \nE ...Full output truncated (7 lines hidden), use '-vv' to show\n\ntests/verifications/openai_api/test_chat_completion.py:429: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.000527665950357914,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]",
|
|
"lineno": 359,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "text_then_weather_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.012501416960731149,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.585734374821186,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 418,
|
|
"message": "AssertionError: Expected 0 tool calls, but got 2\nassert 2 == 0\n + where 2 = len(([ChatCompletionMessageToolCall(id='call_4fm3kj059swz9no94n6fg54d', function=Function(arguments='{\"location\":\"Sun, NA\"}', name='get_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_lzc5lo7y2p7wjyquvmvvzt64', function=Function(arguments='{\"name\":\"Sun\"}', name='get_latin_name'), type='function', index=1)]))\n + where [ChatCompletionMessageToolCall(id='call_4fm3kj059swz9no94n6fg54d', function=Function(arguments='{\"location\":\"Sun, NA\"}', name='get_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_lzc5lo7y2p7wjyquvmvvzt64', function=Function(arguments='{\"name\":\"Sun\"}', name='get_latin_name'), type='function', index=1)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_4fm3kj059swz9no94n6fg54d', function=Function(arguments='{\"location\":\"Sun, NA\"}', name='get_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_lzc5lo7y2p7wjyquvmvvzt64', function=Function(arguments='{\"name\":\"Sun\"}', name='get_latin_name'), type='function', index=1)]).tool_calls"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 418,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]>>\nopenai_client = <openai.OpenAI object at 0x10e19b910>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 0 tool calls, but got 2\nE assert 2 == 0\nE + where 2 = len(([ChatCompletionMessageToolCall(id='call_4fm3kj059swz9no94n6fg54d', function=Function(arguments='{\"location\":\"Sun, NA\"}', name='get_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_lzc5lo7y2p7wjyquvmvvzt64', function=Function(arguments='{\"name\":\"Sun\"}', name='get_latin_name'), type='function', index=1)]))\nE + where [ChatCompletionMessageToolCall(id='call_4fm3kj059swz9no94n6fg54d', function=Function(arguments='{\"location\":\"Sun, NA\"}', name='get_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_lzc5lo7y2p7wjyquvmvvzt64', function=Function(arguments='{\"name\":\"Sun\"}', name='get_latin_name'), type='function', index=1)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_4fm3kj059swz9no94n6fg54d', function=Function(arguments='{\"location\":\"Sun, NA\"}', name='get_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_lzc5lo7y2p7wjyquvmvvzt64', function=Function(arguments='{\"name\":\"Sun\"}', name='get_latin_name'), type='function', index=1)]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0003941669128835201,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]",
|
|
"lineno": 359,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "weather_tool_then_text"
|
|
},
|
|
"setup": {
|
|
"duration": 0.014057958032935858,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.7121559998486191,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00048266700468957424,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]",
|
|
"lineno": 359,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "add_product_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.02072141715325415,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.0424797078594565,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0004878339823335409,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]",
|
|
"lineno": 359,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "get_then_create_event_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.018570583080872893,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 3.4340267919469625,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00023016706109046936,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]",
|
|
"lineno": 359,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "compare_monthly_expense_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.009570334106683731,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.2068665840197355,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00051837507635355,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]",
|
|
"lineno": 359,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "text_then_weather_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.01873366697691381,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5193468749057502,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 446,
|
|
"message": "AssertionError: Expected one of ['sol'] in content, but got: '{\"name\": null, \"parameters\": null}'\nassert False\n + where False = any(<generator object test_chat_non_streaming_multi_turn_tool_calling.<locals>.<genexpr> at 0x10e4c0f90>)"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 446,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]>>\nopenai_client = <openai.OpenAI object at 0x1079fc160>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n tool_call = assistant_message.tool_calls[0]\n assert tool_call.function.name == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call.function.name}'\"\n )\n # Parse the JSON string arguments before comparing\n actual_arguments = json.loads(tool_call.function.arguments)\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": tool_call.id,\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n assert assistant_message.content is not None, \"Expected content, but none received.\"\n expected_answers = expected[\"answer\"] # This is now a list\n content_lower = assistant_message.content.lower()\n> assert any(ans.lower() in content_lower for ans in expected_answers), (\n f\"Expected one of {expected_answers} in content, but got: '{assistant_message.content}'\"\n )\nE AssertionError: Expected one of ['sol'] in content, but got: '{\"name\": null, \"parameters\": null}'\nE assert False\nE + where False = any(<generator object test_chat_non_streaming_multi_turn_tool_calling.<locals>.<genexpr> at 0x10e4c0f90>)\n\ntests/verifications/openai_api/test_chat_completion.py:446: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0004933748859912157,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]",
|
|
"lineno": 359,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "weather_tool_then_text"
|
|
},
|
|
"setup": {
|
|
"duration": 0.014272749889642,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.911199334077537,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00043049990199506283,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]",
|
|
"lineno": 359,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "add_product_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.031040542060509324,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 3.0026419160421938,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00045104208402335644,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]",
|
|
"lineno": 359,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "get_then_create_event_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.016529500018805265,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.7563346249517053,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 429,
|
|
"message": "AssertionError: Expected arguments '{'name': 'Team Building', 'date': '2025-03-03', 'time': '10:00', 'location': 'Main Conference Room', 'participants': ['Alice', 'Bob', 'Charlie']}', got '{'participants': '[\"Alice\", \"Bob\", \"Charlie\"]', 'location': 'Main Conference Room', 'name': 'Team Building', 'date': '2025-03-03', 'time': '10:00'}'\nassert {'date': '202...arlie\"]', ...} == {'date': '202...harlie'], ...}\n \n Omitting 4 identical items, use -vv to show\n Differing items:\n {'participants': '[\"Alice\", \"Bob\", \"Charlie\"]'} != {'participants': ['Alice', 'Bob', 'Charlie']}\n \n Full diff:\n {...\n \n ...Full output truncated (11 lines hidden), use '-vv' to show"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 429,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]>>\nopenai_client = <openai.OpenAI object at 0x10e1c7430>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n tool_call = assistant_message.tool_calls[0]\n assert tool_call.function.name == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call.function.name}'\"\n )\n # Parse the JSON string arguments before comparing\n actual_arguments = json.loads(tool_call.function.arguments)\n> assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\nE AssertionError: Expected arguments '{'name': 'Team Building', 'date': '2025-03-03', 'time': '10:00', 'location': 'Main Conference Room', 'participants': ['Alice', 'Bob', 'Charlie']}', got '{'participants': '[\"Alice\", \"Bob\", \"Charlie\"]', 'location': 'Main Conference Room', 'name': 'Team Building', 'date': '2025-03-03', 'time': '10:00'}'\nE assert {'date': '202...arlie\"]', ...} == {'date': '202...harlie'], ...}\nE \nE Omitting 4 identical items, use -vv to show\nE Differing items:\nE {'participants': '[\"Alice\", \"Bob\", \"Charlie\"]'} != {'participants': ['Alice', 'Bob', 'Charlie']}\nE \nE Full diff:\nE {...\nE \nE ...Full output truncated (11 lines hidden), use '-vv' to show\n\ntests/verifications/openai_api/test_chat_completion.py:429: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0005542081780731678,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]",
|
|
"lineno": 359,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "compare_monthly_expense_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.013607957866042852,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 3.0105869588442147,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0004793750122189522,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "text_then_weather_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.01806124998256564,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.3295827910769731,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 500,
|
|
"message": "AssertionError: Expected 0 tool calls, but got 1\nassert 1 == 0\n + where 1 = len(([{'function': {'arguments': '{\"location\":\"San Francisco, CA\"}', 'name': 'get_weather'}, 'id': 'call_l066e8oey2i8exeodczlv1mh', 'type': 'function'}]))"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 500,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]>>\nopenai_client = <openai.OpenAI object at 0x10e45efe0>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 0 tool calls, but got 1\nE assert 1 == 0\nE + where 1 = len(([{'function': {'arguments': '{\"location\":\"San Francisco, CA\"}', 'name': 'get_weather'}, 'id': 'call_l066e8oey2i8exeodczlv1mh', 'type': 'function'}]))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002942080609500408,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "weather_tool_then_text"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007637625094503164,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.021851292112842,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 526,
|
|
"message": "AssertionError: Expected content, but none received.\nassert ('' is not None and '' != '')"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 526,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]>>\nopenai_client = <openai.OpenAI object at 0x10e409150>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n # Use the first accumulated tool call for assertion\n tool_call = accumulated_tool_calls[0]\n assert tool_call[\"function\"][\"name\"] == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'\"\n )\n # Parse the accumulated arguments string for comparison\n actual_arguments = json.loads(tool_call[\"function\"][\"arguments\"])\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": tool_call[\"id\"],\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n> assert accumulated_content is not None and accumulated_content != \"\", \"Expected content, but none received.\"\nE AssertionError: Expected content, but none received.\nE assert ('' is not None and '' != '')\n\ntests/verifications/openai_api/test_chat_completion.py:526: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00036791712045669556,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]",
|
|
"lineno": 450,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "add_product_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.013031583046540618,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.8596610419917852,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00042829103767871857,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "get_then_create_event_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.015244666952639818,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.0227877080906183,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 526,
|
|
"message": "AssertionError: Expected content, but none received.\nassert ('' is not None and '' != '')"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 526,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]>>\nopenai_client = <openai.OpenAI object at 0x10e1c5030>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n # Use the first accumulated tool call for assertion\n tool_call = accumulated_tool_calls[0]\n assert tool_call[\"function\"][\"name\"] == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'\"\n )\n # Parse the accumulated arguments string for comparison\n actual_arguments = json.loads(tool_call[\"function\"][\"arguments\"])\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": tool_call[\"id\"],\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n> assert accumulated_content is not None and accumulated_content != \"\", \"Expected content, but none received.\"\nE AssertionError: Expected content, but none received.\nE assert ('' is not None and '' != '')\n\ntests/verifications/openai_api/test_chat_completion.py:526: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00024933391250669956,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "compare_monthly_expense_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.008626125054433942,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.3212552920449525,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 512,
|
|
"message": "AssertionError: Expected arguments '{'month': 1, 'year': 2025}', got '{'month': '1', 'year': '2025'}'\nassert {'month': '1', 'year': '2025'} == {'month': 1, 'year': 2025}\n \n Differing items:\n {'month': '1'} != {'month': 1}\n {'year': '2025'} != {'year': 2025}\n \n Full diff:\n {...\n \n ...Full output truncated (7 lines hidden), use '-vv' to show"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 512,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]>>\nopenai_client = <openai.OpenAI object at 0x10eb5ec80>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n # Use the first accumulated tool call for assertion\n tool_call = accumulated_tool_calls[0]\n assert tool_call[\"function\"][\"name\"] == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'\"\n )\n # Parse the accumulated arguments string for comparison\n actual_arguments = json.loads(tool_call[\"function\"][\"arguments\"])\n> assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\nE AssertionError: Expected arguments '{'month': 1, 'year': 2025}', got '{'month': '1', 'year': '2025'}'\nE assert {'month': '1', 'year': '2025'} == {'month': 1, 'year': 2025}\nE \nE Differing items:\nE {'month': '1'} != {'month': 1}\nE {'year': '2025'} != {'year': 2025}\nE \nE Full diff:\nE {...\nE \nE ...Full output truncated (7 lines hidden), use '-vv' to show\n\ntests/verifications/openai_api/test_chat_completion.py:512: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00020562508143484592,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "text_then_weather_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007338125025853515,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.4175920831039548,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 485,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]>>\nopenai_client = <openai.OpenAI object at 0x10e4ed870>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10e1f42b0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00023462506942451,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "weather_tool_then_text"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007788832997903228,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.45610866602510214,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 485,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]>>\nopenai_client = <openai.OpenAI object at 0x10eb5d960>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10e446e00>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00021450011990964413,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "add_product_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.006751166889443994,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.7053082089405507,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 485,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]>>\nopenai_client = <openai.OpenAI object at 0x10e19ba30>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'add_product_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'inStock': True, 'name': 'Widget...}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': 'Successfully added product with id: 123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10e10f0a0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00021783309057354927,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "get_then_create_event_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.008729791967198253,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5665898330044001,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 485,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]>>\nopenai_client = <openai.OpenAI object at 0x10e18bb50>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x107b3de10>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002288338728249073,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "compare_monthly_expense_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.009526000125333667,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.1714977910742164,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 485,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]>>\nopenai_client = <openai.OpenAI object at 0x107a41b70>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10e48f5b0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00032483390532433987,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "text_then_weather_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.010107750073075294,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.26202141703106463,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 485,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]>>\nopenai_client = <openai.OpenAI object at 0x10e18af20>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10e1a0ca0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00022558285854756832,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "weather_tool_then_text"
|
|
},
|
|
"setup": {
|
|
"duration": 0.008256082888692617,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.3466235001105815,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 485,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]>>\nopenai_client = <openai.OpenAI object at 0x10e46eb00>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10eb2b640>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.000535458093509078,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "add_product_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.0180504999589175,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.8803812500555068,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 485,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]>>\nopenai_client = <openai.OpenAI object at 0x10e1dce50>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'add_product_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'inStock': True, 'name': 'Widget...}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': 'Successfully added product with id: 123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10e45eb00>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00025062495842576027,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "get_then_create_event_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.00993091706186533,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5258524999953806,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 485,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]>>\nopenai_client = <openai.OpenAI object at 0x10e1dcaf0>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10e480790>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002823749091476202,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]",
|
|
"lineno": 450,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "compare_monthly_expense_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.047535917023196816,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.4426498331595212,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 485,
|
|
"message": ""
|
|
},
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 587,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]>>\nopenai_client = <openai.OpenAI object at 0x10e1dc310>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10e459ab0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0010368749499320984,
|
|
"outcome": "passed"
|
|
}
|
|
}
|
|
],
|
|
"run_timestamp": 1744679294
|
|
}
|