# What does this PR do?

## Test Plan

uv run --with-editable ".[dev]" python tests/verifications/generate_report.py --run-tests
{
  "created": 1744841154.6007879,
  "duration": 120.4372878074646,
  "exitcode": 1,
  "root": "/Users/erichuang/projects/llama-stack",
  "environment": {},
  "summary": {
    "passed": 39,
    "failed": 37,
    "skipped": 2,
    "total": 78,
    "collected": 78
  },
  "collectors": [
    {
      "nodeid": "",
      "outcome": "passed",
      "result": [
        {
          "nodeid": "tests/verifications/openai_api/test_chat_completion.py",
          "type": "Module"
        }
      ]
    },
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"outcome": "passed",
|
|
"result": [
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]",
|
|
"type": "Function",
|
|
"lineno": 74
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]",
|
|
"type": "Function",
|
|
"lineno": 74
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]",
|
|
"type": "Function",
|
|
"lineno": 74
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]",
|
|
"type": "Function",
|
|
"lineno": 74
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]",
|
|
"type": "Function",
|
|
"lineno": 74
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]",
|
|
"type": "Function",
|
|
"lineno": 74
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]",
|
|
"type": "Function",
|
|
"lineno": 93
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]",
|
|
"type": "Function",
|
|
"lineno": 93
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]",
|
|
"type": "Function",
|
|
"lineno": 93
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]",
|
|
"type": "Function",
|
|
"lineno": 93
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]",
|
|
"type": "Function",
|
|
"lineno": 93
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]",
|
|
"type": "Function",
|
|
"lineno": 93
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 117
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 117
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 117
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 136
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 136
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 136
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]",
|
|
"type": "Function",
|
|
"lineno": 160
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]",
|
|
"type": "Function",
|
|
"lineno": 160
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]",
|
|
"type": "Function",
|
|
"lineno": 160
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]",
|
|
"type": "Function",
|
|
"lineno": 160
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]",
|
|
"type": "Function",
|
|
"lineno": 160
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]",
|
|
"type": "Function",
|
|
"lineno": 160
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]",
|
|
"type": "Function",
|
|
"lineno": 183
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]",
|
|
"type": "Function",
|
|
"lineno": 183
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]",
|
|
"type": "Function",
|
|
"lineno": 183
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]",
|
|
"type": "Function",
|
|
"lineno": 183
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]",
|
|
"type": "Function",
|
|
"lineno": 183
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]",
|
|
"type": "Function",
|
|
"lineno": 183
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 205
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 205
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 205
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 229
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 229
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 229
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 257
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 257
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 257
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 282
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 282
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 282
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 309
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 309
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 309
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"type": "Function",
|
|
"lineno": 332
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"type": "Function",
|
|
"lineno": 332
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"type": "Function",
|
|
"lineno": 332
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]",
|
|
"type": "Function",
|
|
"lineno": 360
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]",
|
|
"type": "Function",
|
|
"lineno": 451
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"tests": [
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]",
|
|
"lineno": 74,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-earth",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "earth"
|
|
},
|
|
"setup": {
|
|
"duration": 0.21532604098320007,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.9991857919376343,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0001563748810440302,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]",
|
|
"lineno": 74,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "saturn"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007130792131647468,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.1308259170036763,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00015199999324977398,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]",
|
|
"lineno": 74,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-earth",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "earth"
|
|
},
|
|
"setup": {
|
|
"duration": 0.015451540937647223,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.8688064580783248,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00015308288857340813,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]",
|
|
"lineno": 74,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "saturn"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007731583202257752,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.46771004190668464,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0007200830150395632,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]",
|
|
"lineno": 74,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "earth"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007446125149726868,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.3933757909107953,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.002874624915421009,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]",
|
|
"lineno": 74,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "saturn"
|
|
},
|
|
"setup": {
|
|
"duration": 0.01013387506827712,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.39105829200707376,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00015466706827282906,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]",
|
|
"lineno": 93,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-earth",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "earth"
|
|
},
|
|
"setup": {
|
|
"duration": 0.008418583078309894,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.4248087501619011,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00016704201698303223,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]",
|
|
"lineno": 93,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "saturn"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007518124999478459,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.7563416250050068,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00016262498684227467,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]",
|
|
"lineno": 93,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-earth",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "earth"
|
|
},
|
|
"setup": {
|
|
"duration": 0.009950791951268911,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.2686829590238631,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]>>\nopenai_client = <openai.OpenAI object at 0x10c61d7e0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'earth', 'input': {'messages': [{'content': 'Which planet do humans live on?', 'role': 'user'}]}, 'output': 'Earth'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:111: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002637500874698162,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]",
|
|
"lineno": 93,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "saturn"
|
|
},
|
|
"setup": {
|
|
"duration": 0.011679667048156261,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.4552199998870492,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]>>\nopenai_client = <openai.OpenAI object at 0x10c61c4f0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'saturn', 'input': {'messages': [{'content': 'Which planet has rings around it with a name starting with letter S?', 'role': 'user'}]}, 'output': 'Saturn'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:111: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00024562515318393707,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]",
|
|
"lineno": 93,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "earth"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007694624830037355,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.998882583109662,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]>>\nopenai_client = <openai.OpenAI object at 0x10c175810>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'earth', 'input': {'messages': [{'content': 'Which planet do humans live on?', 'role': 'user'}]}, 'output': 'Earth'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:111: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00022433395497500896,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]",
|
|
"lineno": 93,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "saturn"
|
|
},
|
|
"setup": {
|
|
"duration": 0.006812750129029155,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.34369166707620025,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 111,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]>>\nopenai_client = <openai.OpenAI object at 0x10c61c7c0>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'saturn', 'input': {'messages': [{'content': 'Which planet has rings around it with a name starting with letter S?', 'role': 'user'}]}, 'output': 'Saturn'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:111: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00029608397744596004,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"lineno": 117,
|
|
"outcome": "skipped",
|
|
"keywords": [
|
|
"test_chat_non_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.006911124801263213,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.00013570813462138176,
|
|
"outcome": "skipped",
|
|
"longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py', 126, 'Skipped: Skipping test_chat_non_streaming_image for model meta-llama/Llama-3.3-70B-Instruct-Turbo on provider together based on config.')"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00011799996718764305,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"lineno": 117,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007865542080253363,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.211856249952689,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00015016691759228706,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"lineno": 117,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007291208021342754,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 4.980133082950488,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002584999892860651,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"lineno": 136,
|
|
"outcome": "skipped",
|
|
"keywords": [
|
|
"test_chat_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.009254832984879613,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.00016950001008808613,
|
|
"outcome": "skipped",
|
|
"longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py', 145, 'Skipped: Skipping test_chat_streaming_image for model meta-llama/Llama-3.3-70B-Instruct-Turbo on provider together based on config.')"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0001239590346813202,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"lineno": 136,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.019581791944801807,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.487935832934454,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 154,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 154,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]>>\nopenai_client = <openai.OpenAI object at 0x10c6a37f0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': [{'text': 'What is in this image?', 'type': 'text'}, {'image_url': {...}, 'type': 'image_url'}], 'role': 'user'}]}, 'output': 'llama'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_image\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:154: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00024645915254950523,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"lineno": 136,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "case0"
|
|
},
|
|
"setup": {
|
|
"duration": 0.01211779098957777,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 3.920052665984258,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 154,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 154,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]>>\nopenai_client = <openai.OpenAI object at 0x10c175f30>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': [{'text': 'What is in this image?', 'type': 'text'}, {'image_url': {...}, 'type': 'image_url'}], 'role': 'user'}]}, 'output': 'llama'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_image\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:154: IndexError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00047275004908442497,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]",
|
|
"lineno": 160,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "calendar"
|
|
},
|
|
"setup": {
|
|
"duration": 0.01848520804196596,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.4586717090569437,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002318748738616705,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]",
|
|
"lineno": 160,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-math",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "math"
|
|
},
|
|
"setup": {
|
|
"duration": 0.0069474580232053995,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.9735800828784704,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00016279099509119987,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]",
|
|
"lineno": 160,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "calendar"
|
|
},
|
|
"setup": {
|
|
"duration": 0.006996707990765572,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.6836131250020117,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00015366706065833569,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]",
|
|
"lineno": 160,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-math",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "math"
|
|
},
|
|
"setup": {
|
|
"duration": 0.0066205840557813644,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 3.5288485831115395,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00015287497080862522,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]",
|
|
"lineno": 160,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "calendar"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007501666899770498,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5137577499262989,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00015366706065833569,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]",
|
|
"lineno": 160,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "math"
|
|
},
|
|
"setup": {
|
|
"duration": 0.0072085000574588776,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.893309208098799,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00017254101112484932,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]",
|
|
"lineno": 183,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "calendar"
|
|
},
|
|
"setup": {
|
|
"duration": 0.006752792047336698,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.520758124999702,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00022079190239310265,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]",
|
|
"lineno": 183,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-math",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "math"
|
|
},
|
|
"setup": {
|
|
"duration": 0.008957375073805451,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 15.490330374799669,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00014704209752380848,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]",
|
|
"lineno": 183,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "calendar"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007771959062665701,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.644345791079104,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 202,
|
|
"message": "IndexError: list index out of range"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 202,
|
|
"message": "IndexError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]>>\nopenai_client = <openai.OpenAI object at 0x10c6bdb40>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'calendar', 'input': {'messages': [{'content': 'Extract the event information.', 'role': 'system'}, {'cont...articipants'], 'title': 'CalendarEvent', 'type': 'object'}}, 'type': 'json_schema'}}, 'output': 'valid_calendar_event'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:202: IndexError"
},
"teardown": {
"duration": 0.00024341698735952377,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]",
"lineno": 183,
"outcome": "failed",
"keywords": [
"test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-math",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "math"
},
"setup": {
"duration": 0.008734249975532293,
"outcome": "passed"
},
"call": {
"duration": 4.31767199980095,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 202,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 202,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]>>\nopenai_client = <openai.OpenAI object at 0x10ca5bc10>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'math', 'input': {'messages': [{'content': 'You are a helpful math tutor. Guide the user through the solut... ['steps', 'final_answer'], 'title': 'MathReasoning', ...}}, 'type': 'json_schema'}}, 'output': 'valid_math_reasoning'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:202: IndexError"
},
"teardown": {
"duration": 0.00026674987748265266,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]",
"lineno": 183,
"outcome": "failed",
"keywords": [
"test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "calendar"
},
"setup": {
"duration": 0.006908582989126444,
"outcome": "passed"
},
"call": {
"duration": 0.46308279200457036,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 202,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 202,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]>>\nopenai_client = <openai.OpenAI object at 0x10b7bffa0>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'calendar', 'input': {'messages': [{'content': 'Extract the event information.', 'role': 'system'}, {'cont...articipants'], 'title': 'CalendarEvent', 'type': 'object'}}, 'type': 'json_schema'}}, 'output': 'valid_calendar_event'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:202: IndexError"
},
"teardown": {
"duration": 0.0003908751532435417,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]",
"lineno": 183,
"outcome": "failed",
"keywords": [
"test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "math"
},
"setup": {
"duration": 0.0073979999870061874,
"outcome": "passed"
},
"call": {
"duration": 2.537265666993335,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 202,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 202,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]>>\nopenai_client = <openai.OpenAI object at 0x10c6c5d50>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'math', 'input': {'messages': [{'content': 'You are a helpful math tutor. Guide the user through the solut... ['steps', 'final_answer'], 'title': 'MathReasoning', ...}}, 'type': 'json_schema'}}, 'output': 'valid_math_reasoning'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:202: IndexError"
},
"teardown": {
"duration": 0.00026933313347399235,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
"lineno": 205,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"case_id": "case0"
},
"setup": {
"duration": 0.007018249947577715,
"outcome": "passed"
},
"call": {
"duration": 1.0225670000072569,
"outcome": "passed"
},
"teardown": {
"duration": 0.00030558393336832523,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
"lineno": 205,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "case0"
},
"setup": {
"duration": 0.007612749934196472,
"outcome": "passed"
},
"call": {
"duration": 0.35967333405278623,
"outcome": "passed"
},
"teardown": {
"duration": 0.00023795804008841515,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
"lineno": 205,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "case0"
},
"setup": {
"duration": 0.007069834042340517,
"outcome": "passed"
},
"call": {
"duration": 0.3653114167973399,
"outcome": "passed"
},
"teardown": {
"duration": 0.00015424983575940132,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
"lineno": 229,
"outcome": "passed",
"keywords": [
"test_chat_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"case_id": "case0"
},
"setup": {
"duration": 0.007679749978706241,
"outcome": "passed"
},
"call": {
"duration": 0.5530709580052644,
"outcome": "passed"
},
"teardown": {
"duration": 0.00016416702419519424,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
"lineno": 229,
"outcome": "failed",
"keywords": [
"test_chat_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "case0"
},
"setup": {
"duration": 0.007491416065022349,
"outcome": "passed"
},
"call": {
"duration": 0.4884651671163738,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 247,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]>>\nopenai_client = <openai.OpenAI object at 0x10ca426b0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n> _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:247: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10caac9d0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.0002495420631021261,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
"lineno": 229,
"outcome": "failed",
"keywords": [
"test_chat_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "case0"
},
"setup": {
"duration": 0.00810704194009304,
"outcome": "passed"
},
"call": {
"duration": 0.4408426668960601,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 247,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]>>\nopenai_client = <openai.OpenAI object at 0x10c000400>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n> _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:247: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10caaeec0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.0002715839073061943,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
"lineno": 257,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"case_id": "case0"
},
"setup": {
"duration": 0.008122375002130866,
"outcome": "passed"
},
"call": {
"duration": 1.2647117911837995,
"outcome": "passed",
"stdout": "ChatCompletion(id='nqNdhnC-2j9zxn-9316fb372a8dcfc8', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_bmer2gstj7kb3av5poqbufp1', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]), seed=14065825304993057000)], created=1744841096, model='meta-llama/Llama-3.3-70B-Instruct-Turbo', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=26, prompt_tokens=220, total_tokens=246, completion_tokens_details=None, prompt_tokens_details=None, cached_tokens=0), prompt=[])\n"
},
"teardown": {
"duration": 0.00014750007539987564,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
"lineno": 257,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "case0"
},
"setup": {
"duration": 0.00704649998806417,
"outcome": "passed"
},
"call": {
"duration": 0.42037149984389544,
"outcome": "passed",
"stdout": "ChatCompletion(id='nqNdi94-2j9zxn-9316fb3eef09ebe3', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_wmv7dk50bsnhnk2poocg0cwl', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]), seed=None)], created=1744841098, model='meta-llama/Llama-4-Scout-17B-16E-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=18, prompt_tokens=198, total_tokens=216, completion_tokens_details=None, prompt_tokens_details=None), prompt=[])\n"
},
"teardown": {
"duration": 0.00017291703261435032,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
"lineno": 257,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "case0"
},
"setup": {
"duration": 0.008176584029570222,
"outcome": "passed"
},
"call": {
"duration": 0.3381002079695463,
"outcome": "passed",
"stdout": "ChatCompletion(id='nqNdiFd-28Eivz-9316fb419863944d', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_5h00zb6me3342igyllvyrjj7', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]), seed=None)], created=1744841098, model='meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=18, prompt_tokens=198, total_tokens=216, completion_tokens_details=None, prompt_tokens_details=None), prompt=[])\n"
},
"teardown": {
"duration": 0.00015812506899237633,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
"lineno": 282,
"outcome": "passed",
"keywords": [
"test_chat_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"case_id": "case0"
},
"setup": {
"duration": 0.009897291893139482,
"outcome": "passed"
},
"call": {
"duration": 1.5261498331092298,
"outcome": "passed"
},
"teardown": {
"duration": 0.0002149590291082859,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
"lineno": 282,
"outcome": "failed",
"keywords": [
"test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "case0"
},
"setup": {
"duration": 0.007385874865576625,
"outcome": "passed"
},
"call": {
"duration": 0.5376293750014156,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 301,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]>>\nopenai_client = <openai.OpenAI object at 0x10ca42da0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"required\", # Force tool call\n stream=True,\n )\n \n> _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:301: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10ca41b40>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.0002947079483419657,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
"lineno": 282,
"outcome": "failed",
"keywords": [
"test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "case0"
},
"setup": {
"duration": 0.008081958163529634,
"outcome": "passed"
},
"call": {
"duration": 0.4107254999689758,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 301,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]>>\nopenai_client = <openai.OpenAI object at 0x10ca6b6a0>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"required\", # Force tool call\n stream=True,\n )\n \n> _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:301: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10ca85db0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.00025158398784697056,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
"lineno": 309,
"outcome": "failed",
"keywords": [
"test_chat_non_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"case_id": "case0"
},
"setup": {
"duration": 0.010461833095178008,
"outcome": "passed"
},
"call": {
"duration": 1.1223525418899953,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 329,
"message": "AssertionError: Expected no tool calls when tool_choice='none'\nassert [ChatCompletionMessageToolCall(id='call_g9yti6yqsw38wvtvndlflei7', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)] is None\n + where [ChatCompletionMessageToolCall(id='call_g9yti6yqsw38wvtvndlflei7', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_g9yti6yqsw38wvtvndlflei7', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]).tool_calls\n + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_g9yti6yqsw38wvtvndlflei7', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_g9yti6yqsw38wvtvndlflei7', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]), seed=1754099529794631000).message"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 329,
"message": "AssertionError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]>>\nopenai_client = <openai.OpenAI object at 0x10c6c6560>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert response.choices[0].message.tool_calls is None, \"Expected no tool calls when tool_choice='none'\"\nE AssertionError: Expected no tool calls when tool_choice='none'\nE assert [ChatCompletionMessageToolCall(id='call_g9yti6yqsw38wvtvndlflei7', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)] is None\nE + where [ChatCompletionMessageToolCall(id='call_g9yti6yqsw38wvtvndlflei7', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_g9yti6yqsw38wvtvndlflei7', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]).tool_calls\nE + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_g9yti6yqsw38wvtvndlflei7', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_g9yti6yqsw38wvtvndlflei7', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]), seed=1754099529794631000).message\n\ntests/verifications/openai_api/test_chat_completion.py:329: AssertionError"
},
"teardown": {
"duration": 0.0002299160696566105,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
"lineno": 309,
"outcome": "failed",
"keywords": [
"test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "case0"
},
"setup": {
"duration": 0.0073735828045755625,
"outcome": "passed"
},
"call": {
"duration": 0.38580279191955924,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 329,
"message": "AssertionError: Expected no tool calls when tool_choice='none'\nassert [ChatCompletionMessageToolCall(id='call_f3d5174dyb3hxwsnotdhu0bn', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] is None\n + where [ChatCompletionMessageToolCall(id='call_f3d5174dyb3hxwsnotdhu0bn', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_f3d5174dyb3hxwsnotdhu0bn', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]).tool_calls\n + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_f3d5174dyb3hxwsnotdhu0bn', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_f3d5174dyb3hxwsnotdhu0bn', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]), seed=None).message"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 329,
"message": "AssertionError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]>>\nopenai_client = <openai.OpenAI object at 0x10ca42da0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert response.choices[0].message.tool_calls is None, \"Expected no tool calls when tool_choice='none'\"\nE AssertionError: Expected no tool calls when tool_choice='none'\nE assert [ChatCompletionMessageToolCall(id='call_f3d5174dyb3hxwsnotdhu0bn', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] is None\nE + where [ChatCompletionMessageToolCall(id='call_f3d5174dyb3hxwsnotdhu0bn', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_f3d5174dyb3hxwsnotdhu0bn', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]).tool_calls\nE + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_f3d5174dyb3hxwsnotdhu0bn', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_f3d5174dyb3hxwsnotdhu0bn', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]), seed=None).message\n\ntests/verifications/openai_api/test_chat_completion.py:329: AssertionError"
},
"teardown": {
"duration": 0.00027966685593128204,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
"lineno": 309,
"outcome": "failed",
"keywords": [
"test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "case0"
},
"setup": {
"duration": 0.006746791070327163,
"outcome": "passed"
},
"call": {
"duration": 0.3289988338947296,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 329,
"message": "AssertionError: Expected no tool calls when tool_choice='none'\nassert [ChatCompletionMessageToolCall(id='call_z5imwjfzlce7v1sjx2x7z7rj', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] is None\n + where [ChatCompletionMessageToolCall(id='call_z5imwjfzlce7v1sjx2x7z7rj', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_z5imwjfzlce7v1sjx2x7z7rj', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]).tool_calls\n + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_z5imwjfzlce7v1sjx2x7z7rj', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_z5imwjfzlce7v1sjx2x7z7rj', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]), seed=None).message"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 329,
"message": "AssertionError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]>>\nopenai_client = <openai.OpenAI object at 0x10ce06bc0>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert response.choices[0].message.tool_calls is None, \"Expected no tool calls when tool_choice='none'\"\nE AssertionError: Expected no tool calls when tool_choice='none'\nE assert [ChatCompletionMessageToolCall(id='call_z5imwjfzlce7v1sjx2x7z7rj', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] is None\nE + where [ChatCompletionMessageToolCall(id='call_z5imwjfzlce7v1sjx2x7z7rj', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_z5imwjfzlce7v1sjx2x7z7rj', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]).tool_calls\nE + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_z5imwjfzlce7v1sjx2x7z7rj', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_z5imwjfzlce7v1sjx2x7z7rj', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]), seed=None).message\n\ntests/verifications/openai_api/test_chat_completion.py:329: AssertionError"
},
"teardown": {
"duration": 0.0002757080364972353,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
"lineno": 332,
"outcome": "failed",
"keywords": [
"test_chat_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-3.3-70B-Instruct-Turbo-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"case_id": "case0"
},
"setup": {
"duration": 0.006751707987859845,
"outcome": "passed"
},
"call": {
"duration": 1.8982260411139578,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 356,
"message": "AssertionError: Expected no tool call chunks when tool_choice='none'\nassert not [ChoiceDeltaToolCall(index=0, id='call_x4m8hvw4d9iktfabb0lwwagm', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\n + where [ChoiceDeltaToolCall(index=0, id='call_x4m8hvw4d9iktfabb0lwwagm', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_x4m8hvw4d9iktfabb0lwwagm', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 356,
"message": "AssertionError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]>>\nopenai_client = <openai.OpenAI object at 0x10c6ffac0>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=True,\n )\n \n content = \"\"\n for chunk in stream:\n delta = chunk.choices[0].delta\n if delta.content:\n content += delta.content\n> assert not delta.tool_calls, \"Expected no tool call chunks when tool_choice='none'\"\nE AssertionError: Expected no tool call chunks when tool_choice='none'\nE assert not [ChoiceDeltaToolCall(index=0, id='call_x4m8hvw4d9iktfabb0lwwagm', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\nE + where [ChoiceDeltaToolCall(index=0, id='call_x4m8hvw4d9iktfabb0lwwagm', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_x4m8hvw4d9iktfabb0lwwagm', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:356: AssertionError"
},
"teardown": {
"duration": 0.00020166696049273014,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
"lineno": 332,
"outcome": "failed",
"keywords": [
"test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "case0"
},
"setup": {
"duration": 0.007537916069850326,
"outcome": "passed"
},
"call": {
"duration": 0.463320666924119,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 356,
"message": "AssertionError: Expected no tool call chunks when tool_choice='none'\nassert not [ChoiceDeltaToolCall(index=0, id='call_d4wm4bj2gtl64dbr8p9yvwxe', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\n + where [ChoiceDeltaToolCall(index=0, id='call_d4wm4bj2gtl64dbr8p9yvwxe', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_d4wm4bj2gtl64dbr8p9yvwxe', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 356,
"message": "AssertionError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]>>\nopenai_client = <openai.OpenAI object at 0x10cadf460>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=True,\n )\n \n content = \"\"\n for chunk in stream:\n delta = chunk.choices[0].delta\n if delta.content:\n content += delta.content\n> assert not delta.tool_calls, \"Expected no tool call chunks when tool_choice='none'\"\nE AssertionError: Expected no tool call chunks when tool_choice='none'\nE assert not [ChoiceDeltaToolCall(index=0, id='call_d4wm4bj2gtl64dbr8p9yvwxe', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\nE + where [ChoiceDeltaToolCall(index=0, id='call_d4wm4bj2gtl64dbr8p9yvwxe', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_d4wm4bj2gtl64dbr8p9yvwxe', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:356: AssertionError"
},
"teardown": {
"duration": 0.0002644169144332409,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
"lineno": 332,
"outcome": "failed",
"keywords": [
"test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "case0"
},
"setup": {
"duration": 0.010220374912023544,
"outcome": "passed"
},
"call": {
"duration": 0.3469825841020793,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 356,
"message": "AssertionError: Expected no tool call chunks when tool_choice='none'\nassert not [ChoiceDeltaToolCall(index=0, id='call_q4lv7coily23gc1z694vgpn8', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\n + where [ChoiceDeltaToolCall(index=0, id='call_q4lv7coily23gc1z694vgpn8', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_q4lv7coily23gc1z694vgpn8', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 356,
"message": "AssertionError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]>>\nopenai_client = <openai.OpenAI object at 0x10ca40ca0>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=True,\n )\n \n content = \"\"\n for chunk in stream:\n delta = chunk.choices[0].delta\n if delta.content:\n content += delta.content\n> assert not delta.tool_calls, \"Expected no tool call chunks when tool_choice='none'\"\nE AssertionError: Expected no tool call chunks when tool_choice='none'\nE assert not [ChoiceDeltaToolCall(index=0, id='call_q4lv7coily23gc1z694vgpn8', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\nE + where [ChoiceDeltaToolCall(index=0, id='call_q4lv7coily23gc1z694vgpn8', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_q4lv7coily23gc1z694vgpn8', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:356: AssertionError"
},
"teardown": {
"duration": 0.00033033289946615696,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]",
"lineno": 360,
"outcome": "failed",
"keywords": [
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"case_id": "text_then_weather_tool"
},
"setup": {
"duration": 0.0076314168982207775,
"outcome": "passed"
},
"call": {
"duration": 1.2038672079797834,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 419,
"message": "AssertionError: Expected 0 tool calls, but got 1\nassert 1 == 0\n + where 1 = len(([ChatCompletionMessageToolCall(id='call_z4rvmn0r7oung1cu16ul3gu3', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)]))\n + where [ChatCompletionMessageToolCall(id='call_z4rvmn0r7oung1cu16ul3gu3', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_z4rvmn0r7oung1cu16ul3gu3', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)]).tool_calls"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 419,
"message": "AssertionError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]>>\nopenai_client = <openai.OpenAI object at 0x10c03c550>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 0 tool calls, but got 1\nE assert 1 == 0\nE + where 1 = len(([ChatCompletionMessageToolCall(id='call_z4rvmn0r7oung1cu16ul3gu3', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)]))\nE + where [ChatCompletionMessageToolCall(id='call_z4rvmn0r7oung1cu16ul3gu3', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_z4rvmn0r7oung1cu16ul3gu3', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:419: AssertionError"
},
"teardown": {
"duration": 0.0002806668635457754,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]",
"lineno": 360,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]",
"parametrize",
"pytestmark",
"meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"case_id": "weather_tool_then_text"
},
"setup": {
"duration": 0.007497292011976242,
"outcome": "passed"
},
"call": {
"duration": 2.314662832999602,
"outcome": "passed"
},
"teardown": {
"duration": 0.0002090830821543932,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]",
"lineno": 360,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"case_id": "add_product_tool"
},
"setup": {
"duration": 0.010512124979868531,
"outcome": "passed"
},
"call": {
"duration": 1.7789271660149097,
"outcome": "passed"
},
"teardown": {
"duration": 0.00014504184946417809,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]",
"lineno": 360,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"case_id": "get_then_create_event_tool"
},
"setup": {
"duration": 0.008220916846767068,
"outcome": "passed"
},
"call": {
"duration": 2.6108481250703335,
"outcome": "passed"
},
"teardown": {
"duration": 0.00035962508991360664,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]",
"lineno": 360,
"outcome": "passed",
"keywords": [
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"case_id": "compare_monthly_expense_tool"
},
"setup": {
"duration": 0.007435625186190009,
"outcome": "passed"
},
"call": {
"duration": 2.0318919168785214,
"outcome": "passed"
},
"teardown": {
"duration": 0.00015241606160998344,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]",
"lineno": 360,
"outcome": "failed",
"keywords": [
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "text_then_weather_tool"
},
"setup": {
"duration": 0.008867957862094045,
"outcome": "passed"
},
"call": {
"duration": 0.3960520001128316,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 447,
"message": "AssertionError: Expected one of ['sol'] in content, but got: 'I am unable to fulfill this request as the functions provided are insufficient.'\nassert False\n + where False = any(<generator object test_chat_non_streaming_multi_turn_tool_calling.<locals>.<genexpr> at 0x10c688660>)"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 447,
"message": "AssertionError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]>>\nopenai_client = <openai.OpenAI object at 0x10c175b40>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n tool_call = assistant_message.tool_calls[0]\n assert tool_call.function.name == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call.function.name}'\"\n )\n # Parse the JSON string arguments before comparing\n actual_arguments = json.loads(tool_call.function.arguments)\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": tool_call.id,\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n assert assistant_message.content is not None, \"Expected content, but none received.\"\n expected_answers = expected[\"answer\"] # This is now a list\n content_lower = assistant_message.content.lower()\n> assert any(ans.lower() in content_lower for ans in expected_answers), (\n f\"Expected one of {expected_answers} in content, but got: '{assistant_message.content}'\"\n )\nE AssertionError: Expected one of ['sol'] in content, but got: 'I am unable to fulfill this request as the functions provided are insufficient.'\nE assert False\nE + where False = any(<generator object test_chat_non_streaming_multi_turn_tool_calling.<locals>.<genexpr> at 0x10c688660>)\n\ntests/verifications/openai_api/test_chat_completion.py:447: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0002513329964131117,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]",
|
|
"lineno": 360,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "weather_tool_then_text"
|
|
},
|
|
"setup": {
|
|
"duration": 0.0098578748293221,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.7098766670096666,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00051716691814363,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]",
|
|
"lineno": 360,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "add_product_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007647499907761812,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.932010707911104,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0001623330172151327,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]",
|
|
"lineno": 360,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "get_then_create_event_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.00763283297419548,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.6117105002049357,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00015487498603761196,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]",
|
|
"lineno": 360,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
|
"case_id": "compare_monthly_expense_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007260291138663888,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.2083667907863855,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00043349992483854294,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]",
|
|
"lineno": 360,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "text_then_weather_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.010255292057991028,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.3150998749770224,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 447,
|
|
"message": "AssertionError: Expected one of ['sol'] in content, but got: '{\"name\": null, \"parameters\": null}'\nassert False\n + where False = any(<generator object test_chat_non_streaming_multi_turn_tool_calling.<locals>.<genexpr> at 0x10c68b990>)"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 447,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]>>\nopenai_client = <openai.OpenAI object at 0x10c601db0>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n tool_call = assistant_message.tool_calls[0]\n assert tool_call.function.name == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call.function.name}'\"\n )\n # Parse the JSON string arguments before comparing\n actual_arguments = json.loads(tool_call.function.arguments)\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": tool_call.id,\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n assert assistant_message.content is not None, \"Expected content, but none received.\"\n expected_answers = expected[\"answer\"] # This is now a list\n content_lower = assistant_message.content.lower()\n> assert any(ans.lower() in content_lower for ans in expected_answers), (\n f\"Expected one of {expected_answers} in content, but got: '{assistant_message.content}'\"\n )\nE AssertionError: Expected one of ['sol'] in content, but got: '{\"name\": null, \"parameters\": null}'\nE assert False\nE + where False = any(<generator object test_chat_non_streaming_multi_turn_tool_calling.<locals>.<genexpr> at 0x10c68b990>)\n\ntests/verifications/openai_api/test_chat_completion.py:447: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.000294666038826108,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]",
|
|
"lineno": 360,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "weather_tool_then_text"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007977542001754045,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 0.5852054171264172,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0005060839466750622,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]",
|
|
"lineno": 360,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "add_product_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.008944625034928322,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 3.147708958014846,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.0005282082129269838,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]",
|
|
"lineno": 360,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "get_then_create_event_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.009134833933785558,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 3.0222986668813974,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00014937506057322025,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]",
|
|
"lineno": 360,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
|
"case_id": "compare_monthly_expense_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.008050082949921489,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.8753544169012457,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00026400014758110046,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]",
|
|
"lineno": 451,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "text_then_weather_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.012623165966942906,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.3625199170783162,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 527,
|
|
"message": "AssertionError: Expected content, but none received.\nassert ('' is not None and '' != '')"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 527,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]>>\nopenai_client = <openai.OpenAI object at 0x10c69cee0>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n # Use the first accumulated tool call for assertion\n tool_call = accumulated_tool_calls[0]\n assert tool_call[\"function\"][\"name\"] == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'\"\n )\n # Parse the accumulated arguments string for comparison\n actual_arguments = json.loads(tool_call[\"function\"][\"arguments\"])\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": 
tool_call[\"id\"],\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n> assert accumulated_content is not None and accumulated_content != \"\", \"Expected content, but none received.\"\nE AssertionError: Expected content, but none received.\nE assert ('' is not None and '' != '')\n\ntests/verifications/openai_api/test_chat_completion.py:527: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00024533295072615147,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]",
|
|
"lineno": 451,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "weather_tool_then_text"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007315667113289237,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.8457820839248598,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 527,
|
|
"message": "AssertionError: Expected content, but none received.\nassert ('' is not None and '' != '')"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 527,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]>>\nopenai_client = <openai.OpenAI object at 0x10c6bf9d0>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n # Use the first accumulated tool call for assertion\n tool_call = accumulated_tool_calls[0]\n assert tool_call[\"function\"][\"name\"] == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'\"\n )\n # Parse the accumulated arguments string for comparison\n actual_arguments = json.loads(tool_call[\"function\"][\"arguments\"])\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": 
tool_call[\"id\"],\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n> assert accumulated_content is not None and accumulated_content != \"\", \"Expected content, but none received.\"\nE AssertionError: Expected content, but none received.\nE assert ('' is not None and '' != '')\n\ntests/verifications/openai_api/test_chat_completion.py:527: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00028316606767475605,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]",
|
|
"lineno": 451,
|
|
"outcome": "passed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "add_product_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.007260374957695603,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.4652266670018435,
|
|
"outcome": "passed"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00016629090532660484,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]",
|
|
"lineno": 451,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "get_then_create_event_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.025101042119786143,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 1.8374365421477705,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 527,
|
|
"message": "AssertionError: Expected content, but none received.\nassert ('' is not None and '' != '')"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 527,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]>>\nopenai_client = <openai.OpenAI object at 0x10ca6a140>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n # Use the first accumulated tool call for assertion\n tool_call = accumulated_tool_calls[0]\n assert tool_call[\"function\"][\"name\"] == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'\"\n )\n # Parse the accumulated arguments string for comparison\n actual_arguments = json.loads(tool_call[\"function\"][\"arguments\"])\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": 
tool_call[\"id\"],\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n> assert accumulated_content is not None and accumulated_content != \"\", \"Expected content, but none received.\"\nE AssertionError: Expected content, but none received.\nE assert ('' is not None and '' != '')\n\ntests/verifications/openai_api/test_chat_completion.py:527: AssertionError"
|
|
},
|
|
"teardown": {
|
|
"duration": 0.00024591688998043537,
|
|
"outcome": "passed"
|
|
}
|
|
},
|
|
{
|
|
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]",
|
|
"lineno": 451,
|
|
"outcome": "failed",
|
|
"keywords": [
|
|
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]",
|
|
"parametrize",
|
|
"pytestmark",
|
|
"meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool",
|
|
"test_chat_completion.py",
|
|
"openai_api",
|
|
"verifications",
|
|
"tests",
|
|
"llama-stack",
|
|
""
|
|
],
|
|
"metadata": {
|
|
"model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
|
"case_id": "compare_monthly_expense_tool"
|
|
},
|
|
"setup": {
|
|
"duration": 0.006902666063979268,
|
|
"outcome": "passed"
|
|
},
|
|
"call": {
|
|
"duration": 2.5201194169931114,
|
|
"outcome": "failed",
|
|
"crash": {
|
|
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 527,
|
|
"message": "AssertionError: Expected content, but none received.\nassert ('' is not None and '' != '')"
|
|
},
|
|
"traceback": [
|
|
{
|
|
"path": "tests/verifications/openai_api/test_chat_completion.py",
|
|
"lineno": 527,
|
|
"message": "AssertionError"
|
|
}
|
|
],
|
|
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]>>\nopenai_client = <openai.OpenAI object at 0x10ca55870>\nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n # Use the first accumulated tool call for assertion\n tool_call = accumulated_tool_calls[0]\n assert tool_call[\"function\"][\"name\"] == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'\"\n )\n # Parse the accumulated arguments string for comparison\n actual_arguments = json.loads(tool_call[\"function\"][\"arguments\"])\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": 
tool_call[\"id\"],\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n> assert accumulated_content is not None and accumulated_content != \"\", \"Expected content, but none received.\"\nE AssertionError: Expected content, but none received.\nE assert ('' is not None and '' != '')\n\ntests/verifications/openai_api/test_chat_completion.py:527: AssertionError"
},
"teardown": {
"duration": 0.00026037520729005337,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]",
"lineno": 451,
"outcome": "failed",
"keywords": [
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "text_then_weather_tool"
},
"setup": {
"duration": 0.008579750079661608,
"outcome": "passed"
},
"call": {
"duration": 0.3671212091576308,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 486,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]>>\nopenai_client = <openai.OpenAI object at 0x10c6c5360>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:486: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10ca66140>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.00025516608729958534,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]",
"lineno": 451,
"outcome": "failed",
"keywords": [
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "weather_tool_then_text"
},
"setup": {
"duration": 0.008525707991793752,
"outcome": "passed"
},
"call": {
"duration": 0.49603341589681804,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 486,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]>>\nopenai_client = <openai.OpenAI object at 0x10c6bc6a0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:486: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10c175ff0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.00023645791225135326,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]",
"lineno": 451,
"outcome": "failed",
"keywords": [
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "add_product_tool"
},
"setup": {
"duration": 0.006683999905362725,
"outcome": "passed"
},
"call": {
"duration": 1.8375662080943584,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 486,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]>>\nopenai_client = <openai.OpenAI object at 0x10c61d5a0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'add_product_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'inStock': True, 'name': 'Widget...}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': 'Successfully added product with id: 123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:486: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10c6a32e0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.00024145888164639473,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]",
"lineno": 451,
"outcome": "failed",
"keywords": [
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "get_then_create_event_tool"
},
"setup": {
"duration": 0.01287274993956089,
"outcome": "passed"
},
"call": {
"duration": 0.7619118748698384,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 486,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]>>\nopenai_client = <openai.OpenAI object at 0x10ce5d0c0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:486: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10ce5c190>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.00023716595023870468,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]",
"lineno": 451,
"outcome": "failed",
"keywords": [
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
"case_id": "compare_monthly_expense_tool"
},
"setup": {
"duration": 0.008577040862292051,
"outcome": "passed"
},
"call": {
"duration": 0.44602233287878335,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 486,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]>>\nopenai_client = <openai.OpenAI object at 0x10ce5c3a0>\nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:486: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10ca567d0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.00022924994118511677,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]",
"lineno": 451,
"outcome": "failed",
"keywords": [
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "text_then_weather_tool"
},
"setup": {
"duration": 0.007508292095735669,
"outcome": "passed"
},
"call": {
"duration": 6.219006249913946,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 486,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]>>\nopenai_client = <openai.OpenAI object at 0x10c175f60>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:486: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10ca75ae0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.00025975005701184273,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]",
"lineno": 451,
"outcome": "failed",
"keywords": [
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "weather_tool_then_text"
},
"setup": {
"duration": 0.056057041976600885,
"outcome": "passed"
},
"call": {
"duration": 0.42864158283919096,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 486,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]>>\nopenai_client = <openai.OpenAI object at 0x10ca561d0>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:486: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10c1778b0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.00025275000371038914,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]",
"lineno": 451,
"outcome": "failed",
"keywords": [
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "add_product_tool"
},
"setup": {
"duration": 0.007619959069415927,
"outcome": "passed"
},
"call": {
"duration": 0.6468547079712152,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 486,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]>>\nopenai_client = <openai.OpenAI object at 0x10c6c7760>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'add_product_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'inStock': True, 'name': 'Widget...}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': 'Successfully added product with id: 123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:486: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10ca40eb0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.0002552920486778021,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]",
"lineno": 451,
"outcome": "failed",
"keywords": [
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "get_then_create_event_tool"
},
"setup": {
"duration": 0.00699983281083405,
"outcome": "passed"
},
"call": {
"duration": 0.46285866713151336,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 486,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]>>\nopenai_client = <openai.OpenAI object at 0x10c637640>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:486: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10d906380>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.00024433317594230175,
"outcome": "passed"
}
},
{
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]",
"lineno": 451,
"outcome": "failed",
"keywords": [
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]",
"parametrize",
"pytestmark",
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool",
"test_chat_completion.py",
"openai_api",
"verifications",
"tests",
"llama-stack",
""
],
"metadata": {
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"case_id": "compare_monthly_expense_tool"
},
"setup": {
"duration": 0.007548208115622401,
"outcome": "passed"
},
"call": {
"duration": 0.502064208034426,
"outcome": "failed",
"crash": {
"path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError: list index out of range"
},
"traceback": [
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 486,
"message": ""
},
{
"path": "tests/verifications/openai_api/test_chat_completion.py",
"lineno": 588,
"message": "IndexError"
}
],
"longrepr": "request = <FixtureRequest for <Function test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]>>\nopenai_client = <openai.OpenAI object at 0x10c602b30>\nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:486: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = <openai.Stream object at 0x10caaedd0>\n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:588: IndexError"
},
"teardown": {
"duration": 0.001067916164174676,
"outcome": "passed"
}
}
],
"run_timestamp": 1744841031
}