diff --git a/tests/verifications/REPORT.md b/tests/verifications/REPORT.md index 2309c6404..2dd0af41b 100644 --- a/tests/verifications/REPORT.md +++ b/tests/verifications/REPORT.md @@ -1,6 +1,6 @@ # Test Results Report -*Generated on: 2025-04-10 16:48:18* +*Generated on: 2025-04-14 18:11:37* *This report was generated by running `python tests/verifications/generate_report.py`* @@ -15,15 +15,15 @@ | Provider | Pass Rate | Tests Passed | Total Tests | | --- | --- | --- | --- | -| Together | 64.7% | 22 | 34 | -| Fireworks | 82.4% | 28 | 34 | -| Openai | 100.0% | 24 | 24 | +| Together | 48.7% | 37 | 76 | +| Fireworks | 47.4% | 36 | 76 | +| Openai | 100.0% | 52 | 52 | ## Together -*Tests run on: 2025-04-10 16:46:35* +*Tests run on: 2025-04-14 18:08:14* ```bash # Run all tests for this provider: @@ -48,19 +48,33 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=togethe | test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ | | test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ | | test_chat_non_streaming_image | ⚪ | ✅ | ✅ | +| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ | ✅ | +| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ✅ | ✅ | +| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ❌ | ✅ | +| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ | +| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ | ✅ | | test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ | | test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ | | test_chat_non_streaming_tool_calling | ✅ | ✅ | ✅ | +| test_chat_non_streaming_tool_choice_none | ❌ | ❌ | ❌ | +| test_chat_non_streaming_tool_choice_required | ✅ | ✅ | ✅ | | test_chat_streaming_basic (earth) | ✅ | ❌ | ❌ | | test_chat_streaming_basic (saturn) | ✅ | ❌ | ❌ | | test_chat_streaming_image | ⚪ | ❌ | ❌ | +| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ❌ | ❌ | +| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ❌ | ❌ | +| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ❌ | ❌ | ❌ | +| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ | +| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ❌ | ❌ | ❌ | | test_chat_streaming_structured_output (calendar) | ✅ | ❌ | ❌ | | test_chat_streaming_structured_output (math) | ✅ | ❌ | ❌ | | test_chat_streaming_tool_calling | ✅ | ❌ | ❌ | +| test_chat_streaming_tool_choice_none | ❌ | ❌ | ❌ | +| test_chat_streaming_tool_choice_required | ✅ | ❌ | ❌ | ## Fireworks -*Tests run on: 2025-04-10 16:44:44* +*Tests run on: 2025-04-14 18:04:06* ```bash # Run all tests for this provider: @@ -85,19 +99,33 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=firewor | test_chat_non_streaming_basic (earth) | ✅ | ✅ | ✅ | | test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ | | test_chat_non_streaming_image | ⚪ | ✅ | ✅ | +| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ❌ | ❌ | ❌ | +| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ❌ | ❌ | +| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ❌ | ❌ | ❌ | +| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ | +| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ❌ | ❌ | ❌ | | 
test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ | | test_chat_non_streaming_structured_output (math) | ✅ | ✅ | ✅ | | test_chat_non_streaming_tool_calling | ❌ | ❌ | ❌ | +| test_chat_non_streaming_tool_choice_none | ✅ | ✅ | ✅ | +| test_chat_non_streaming_tool_choice_required | ✅ | ❌ | ❌ | | test_chat_streaming_basic (earth) | ✅ | ✅ | ✅ | | test_chat_streaming_basic (saturn) | ✅ | ✅ | ✅ | | test_chat_streaming_image | ⚪ | ✅ | ✅ | +| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ❌ | ❌ | ❌ | +| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ❌ | ❌ | +| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ❌ | ❌ | ❌ | +| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ | +| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ❌ | ❌ | ❌ | | test_chat_streaming_structured_output (calendar) | ✅ | ✅ | ✅ | | test_chat_streaming_structured_output (math) | ✅ | ✅ | ✅ | | test_chat_streaming_tool_calling | ❌ | ❌ | ❌ | +| test_chat_streaming_tool_choice_none | ✅ | ✅ | ✅ | +| test_chat_streaming_tool_choice_required | ✅ | ❌ | ❌ | ## Openai -*Tests run on: 2025-04-10 16:47:28* +*Tests run on: 2025-04-14 18:09:51* ```bash # Run all tests for this provider: @@ -121,12 +149,26 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=openai | test_chat_non_streaming_basic (earth) | ✅ | ✅ | | test_chat_non_streaming_basic (saturn) | ✅ | ✅ | | test_chat_non_streaming_image | ✅ | ✅ | +| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ | +| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ | ✅ | +| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ✅ | +| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ✅ | ✅ | +| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ | | test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | | test_chat_non_streaming_structured_output (math) | ✅ | ✅ | | test_chat_non_streaming_tool_calling | ✅ | ✅ | +| test_chat_non_streaming_tool_choice_none | ✅ | ✅ | +| test_chat_non_streaming_tool_choice_required | ✅ | ✅ | | test_chat_streaming_basic (earth) | ✅ | ✅ | | test_chat_streaming_basic (saturn) | ✅ | ✅ | | test_chat_streaming_image | ✅ | ✅ | +| test_chat_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ | +| test_chat_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ | ✅ | +| test_chat_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ✅ | +| test_chat_streaming_multi_turn_tool_calling (text_then_weather_tool) | ✅ | ✅ | +| test_chat_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ | | test_chat_streaming_structured_output (calendar) | ✅ | ✅ | | test_chat_streaming_structured_output (math) | ✅ | ✅ | | test_chat_streaming_tool_calling | ✅ | ✅ | +| test_chat_streaming_tool_choice_none | ✅ | ✅ | +| test_chat_streaming_tool_choice_required | ✅ | ✅ | diff --git a/tests/verifications/openai_api/fixtures/test_cases/chat_completion.yaml b/tests/verifications/openai_api/fixtures/test_cases/chat_completion.yaml index 78ea8245d..1ace76e34 100644 --- a/tests/verifications/openai_api/fixtures/test_cases/chat_completion.yaml +++ b/tests/verifications/openai_api/fixtures/test_cases/chat_completion.yaml @@ -131,3 +131,221 @@ test_tool_calling: type: object type: function output: get_weather_tool_call + 
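+# Multi-turn tool-calling fixtures (sketch of the structure as consumed by the tests, not a spec):
+#   - input.messages is a list of user turns; each turn is itself a list of messages.
+#   - tool_responses are consumed in order, one per expected tool call.
+#   - expected lists one entry per assistant response: either a tool call
+#     (tool_name / tool_arguments) or a final answer checked by case-insensitive substring match.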
+test_chat_multi_turn_tool_calling: + test_name: test_chat_multi_turn_tool_calling + test_params: + case: + - case_id: "text_then_weather_tool" + input: + messages: + - - role: user + content: "What's the name of the Sun in latin?" + - - role: user + content: "What's the weather like in San Francisco?" + tools: + - function: + description: Get the current weather + name: get_weather + parameters: + type: object + properties: + location: + description: "The city and state (both required), e.g. San Francisco, CA." + type: string + required: ["location"] + type: function + tool_responses: + - response: "{'response': '70 degrees and foggy'}" + expected: + - num_tool_calls: 0 + answer: ["sol"] + - num_tool_calls: 1 + tool_name: get_weather + tool_arguments: + location: "San Francisco, CA" + - num_tool_calls: 0 + answer: ["foggy", "70 degrees"] + - case_id: "weather_tool_then_text" + input: + messages: + - - role: user + content: "What's the weather like in San Francisco?" + tools: + - function: + description: Get the current weather + name: get_weather + parameters: + type: object + properties: + location: + description: "The city and state (both required), e.g. San Francisco, CA." + type: string + required: ["location"] + type: function + tool_responses: + - response: "{'response': '70 degrees and foggy'}" + expected: + - num_tool_calls: 1 + tool_name: get_weather + tool_arguments: + location: "San Francisco, CA" + - num_tool_calls: 0 + answer: ["foggy", "70 degrees"] + - case_id: "add_product_tool" + input: + messages: + - - role: user + content: "Please add a new product with name 'Widget', price 19.99, in stock, and tags ['new', 'sale'] and give me the product id." + tools: + - function: + description: Add a new product + name: addProduct + parameters: + type: object + properties: + name: + description: "Name of the product" + type: string + price: + description: "Price of the product" + type: number + inStock: + description: "Availability status of the product." + type: boolean + tags: + description: "List of product tags" + type: array + items: + type: string + required: ["name", "price", "inStock"] + type: function + tool_responses: + - response: "{'response': 'Successfully added product with id: 123'}" + expected: + - num_tool_calls: 1 + tool_name: addProduct + tool_arguments: + name: "Widget" + price: 19.99 + inStock: true + tags: + - "new" + - "sale" + - num_tool_calls: 0 + answer: ["123", "product id: 123"] + - case_id: "get_then_create_event_tool" + input: + messages: + - - role: system + content: "Todays date is 2025-03-01." + - role: user + content: "Do i have any meetings on March 3rd at 10 am? Yes or no?" + - - role: user + content: "Alright then, Create an event named 'Team Building', scheduled for that time same time, in the 'Main Conference Room' and add Alice, Bob, Charlie to it. Give me the created event id." 
+ tools: + - function: + description: Create a new event + name: create_event + parameters: + type: object + properties: + name: + description: "Name of the event" + type: string + date: + description: "Date of the event in ISO format" + type: string + time: + description: "Event Time (HH:MM)" + type: string + location: + description: "Location of the event" + type: string + participants: + description: "List of participant names" + type: array + items: + type: string + required: ["name", "date", "time", "location", "participants"] + type: function + - function: + description: Get an event by date and time + name: get_event + parameters: + type: object + properties: + date: + description: "Date of the event in ISO format" + type: string + time: + description: "Event Time (HH:MM)" + type: string + required: ["date", "time"] + type: function + tool_responses: + - response: "{'response': 'No events found for 2025-03-03 at 10:00'}" + - response: "{'response': 'Successfully created new event with id: e_123'}" + expected: + - num_tool_calls: 1 + tool_name: get_event + tool_arguments: + date: "2025-03-03" + time: "10:00" + - num_tool_calls: 0 + answer: ["no", "no events found", "no meetings"] + - num_tool_calls: 1 + tool_name: create_event + tool_arguments: + name: "Team Building" + date: "2025-03-03" + time: "10:00" + location: "Main Conference Room" + participants: + - "Alice" + - "Bob" + - "Charlie" + - num_tool_calls: 0 + answer: ["e_123", "event id: e_123"] + - case_id: "compare_monthly_expense_tool" + input: + messages: + - - role: system + content: "Todays date is 2025-03-01." + - role: user + content: "what was my monthly expense in Jan of this year?" + - - role: user + content: "Was it less than Feb of last year? Only answer with yes or no." + tools: + - function: + description: Get monthly expense summary + name: getMonthlyExpenseSummary + parameters: + type: object + properties: + month: + description: "Month of the year (1-12)" + type: integer + year: + description: "Year" + type: integer + required: ["month", "year"] + type: function + tool_responses: + - response: "{'response': 'Total expenses for January 2025: $1000'}" + - response: "{'response': 'Total expenses for February 2024: $2000'}" + expected: + - num_tool_calls: 1 + tool_name: getMonthlyExpenseSummary + tool_arguments: + month: 1 + year: 2025 + - num_tool_calls: 0 + answer: ["1000", "$1,000", "1,000"] + - num_tool_calls: 1 + tool_name: getMonthlyExpenseSummary + tool_arguments: + month: 2 + year: 2024 + - num_tool_calls: 0 + answer: ["yes"] diff --git a/tests/verifications/openai_api/test_chat_completion.py b/tests/verifications/openai_api/test_chat_completion.py index 6aee29c3a..62a223afb 100644 --- a/tests/verifications/openai_api/test_chat_completion.py +++ b/tests/verifications/openai_api/test_chat_completion.py @@ -4,6 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import copy import json import re from typing import Any @@ -243,43 +244,294 @@ def test_chat_streaming_tool_calling(request, openai_client, model, provider, ve stream=True, ) - # Accumulate partial tool_calls here - tool_calls_buffer = {} - current_id = None - # Process streaming chunks - for chunk in stream: - choice = chunk.choices[0] - delta = choice.delta - - if delta.tool_calls is None: - continue - - for tool_call_delta in delta.tool_calls: - if tool_call_delta.id: - current_id = tool_call_delta.id - call_id = current_id - func_delta = tool_call_delta.function - - if call_id not in tool_calls_buffer: - tool_calls_buffer[call_id] = { - "id": call_id, - "type": tool_call_delta.type, - "name": func_delta.name, - "arguments": "", - } - - if func_delta.arguments: - tool_calls_buffer[call_id]["arguments"] += func_delta.arguments - + _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream) assert len(tool_calls_buffer) == 1 - for call in tool_calls_buffer.values(): + for call in tool_calls_buffer: assert len(call["id"]) > 0 - assert call["name"] == "get_weather" + function = call["function"] + assert function["name"] == "get_weather" - args_dict = json.loads(call["arguments"]) + args_dict = json.loads(function["arguments"]) assert "san francisco" in args_dict["location"].lower() +@pytest.mark.parametrize( + "case", + chat_completion_test_cases["test_tool_calling"]["test_params"]["case"], # Reusing existing case for now + ids=case_id_generator, +) +def test_chat_non_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + response = openai_client.chat.completions.create( + model=model, + messages=case["input"]["messages"], + tools=case["input"]["tools"], + tool_choice="required", # Force tool call + stream=False, + ) + print(response) + + assert response.choices[0].message.role == "assistant" + assert len(response.choices[0].message.tool_calls) > 0, "Expected tool call when tool_choice='required'" + expected_tool_name = case["input"]["tools"][0]["function"]["name"] + assert response.choices[0].message.tool_calls[0].function.name == expected_tool_name + + +@pytest.mark.parametrize( + "case", + chat_completion_test_cases["test_tool_calling"]["test_params"]["case"], # Reusing existing case for now + ids=case_id_generator, +) +def test_chat_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + stream = openai_client.chat.completions.create( + model=model, + messages=case["input"]["messages"], + tools=case["input"]["tools"], + tool_choice="required", # Force tool call + stream=True, + ) + + _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream) + + assert len(tool_calls_buffer) > 0, "Expected tool call when tool_choice='required'" + expected_tool_name = case["input"]["tools"][0]["function"]["name"] + assert any(call["function"]["name"] == expected_tool_name for call in tool_calls_buffer), ( + f"Expected tool call '{expected_tool_name}' not found in stream" + ) + + +@pytest.mark.parametrize( + "case", + 
chat_completion_test_cases["test_tool_calling"]["test_params"]["case"], # Reusing existing case for now + ids=case_id_generator, +) +def test_chat_non_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + response = openai_client.chat.completions.create( + model=model, + messages=case["input"]["messages"], + tools=case["input"]["tools"], + tool_choice="none", + stream=False, + ) + + assert response.choices[0].message.role == "assistant" + assert response.choices[0].message.tool_calls is None, "Expected no tool calls when tool_choice='none'" + assert response.choices[0].message.content is not None, "Expected content when tool_choice='none'" + + +@pytest.mark.parametrize( + "case", + chat_completion_test_cases["test_tool_calling"]["test_params"]["case"], # Reusing existing case for now + ids=case_id_generator, +) +def test_chat_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case): + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + stream = openai_client.chat.completions.create( + model=model, + messages=case["input"]["messages"], + tools=case["input"]["tools"], + tool_choice="none", + stream=True, + ) + + content = "" + for chunk in stream: + delta = chunk.choices[0].delta + if delta.content: + content += delta.content + assert not delta.tool_calls, "Expected no tool call chunks when tool_choice='none'" + + assert len(content) > 0, "Expected content when tool_choice='none'" + + +@pytest.mark.parametrize( + "case", + chat_completion_test_cases.get("test_chat_multi_turn_tool_calling", {}).get("test_params", {}).get("case", []), + ids=case_id_generator, +) +def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case): + """ + Test cases for multi-turn tool calling. + Tool calls are asserted. + Tool responses are provided in the test case. + Final response is asserted. + """ + + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + # Create a copy of the messages list to avoid modifying the original + messages = [] + tools = case["input"]["tools"] + # Use deepcopy to prevent modification across runs/parametrization + expected_results = copy.deepcopy(case["expected"]) + tool_responses = copy.deepcopy(case.get("tool_responses", [])) + input_messages_turns = copy.deepcopy(case["input"]["messages"]) + + # keep going until either + # 1. we have messages to test in multi-turn + # 2. 
no messages but last message is tool response + while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1]["role"] == "tool"): + # do not take new messages if last message is tool response + if len(messages) == 0 or messages[-1]["role"] != "tool": + new_messages = input_messages_turns.pop(0) + # Ensure new_messages is a list of message objects + if isinstance(new_messages, list): + messages.extend(new_messages) + else: + # If it's a single message object, add it directly + messages.append(new_messages) + + # --- API Call --- + response = openai_client.chat.completions.create( + model=model, + messages=messages, + tools=tools, + stream=False, + ) + + # --- Process Response --- + assistant_message = response.choices[0].message + messages.append(assistant_message.model_dump(exclude_unset=True)) + + assert assistant_message.role == "assistant" + + # Get the expected result data + expected = expected_results.pop(0) + num_tool_calls = expected["num_tool_calls"] + + # --- Assertions based on expected result --- + assert len(assistant_message.tool_calls or []) == num_tool_calls, ( + f"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}" + ) + + if num_tool_calls > 0: + tool_call = assistant_message.tool_calls[0] + assert tool_call.function.name == expected["tool_name"], ( + f"Expected tool '{expected['tool_name']}', got '{tool_call.function.name}'" + ) + # Parse the JSON string arguments before comparing + actual_arguments = json.loads(tool_call.function.arguments) + assert actual_arguments == expected["tool_arguments"], ( + f"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'" + ) + + # Prepare and append the tool response for the next turn + tool_response = tool_responses.pop(0) + messages.append( + { + "role": "tool", + "tool_call_id": tool_call.id, + "content": tool_response["response"], + } + ) + else: + assert assistant_message.content is not None, "Expected content, but none received." 
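+            # No tool call expected on this turn: validate the final text against the allowed answers (case-insensitive substring match).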
+ expected_answers = expected["answer"] # This is now a list + content_lower = assistant_message.content.lower() + assert any(ans.lower() in content_lower for ans in expected_answers), ( + f"Expected one of {expected_answers} in content, but got: '{assistant_message.content}'" + ) + + +@pytest.mark.parametrize( + "case", + chat_completion_test_cases.get("test_chat_multi_turn_tool_calling", {}).get("test_params", {}).get("case", []), + ids=case_id_generator, +) +def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case): + """ """ + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + messages = [] + tools = case["input"]["tools"] + expected_results = copy.deepcopy(case["expected"]) + tool_responses = copy.deepcopy(case.get("tool_responses", [])) + input_messages_turns = copy.deepcopy(case["input"]["messages"]) + + while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1]["role"] == "tool"): + if len(messages) == 0 or messages[-1]["role"] != "tool": + new_messages = input_messages_turns.pop(0) + if isinstance(new_messages, list): + messages.extend(new_messages) + else: + messages.append(new_messages) + + # --- API Call (Streaming) --- + stream = openai_client.chat.completions.create( + model=model, + messages=messages, + tools=tools, + stream=True, + ) + + # --- Process Stream --- + accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream) + + # --- Construct Assistant Message for History --- + assistant_message_dict = {"role": "assistant"} + if accumulated_content: + assistant_message_dict["content"] = accumulated_content + if accumulated_tool_calls: + assistant_message_dict["tool_calls"] = accumulated_tool_calls + + messages.append(assistant_message_dict) + + # --- Assertions --- + expected = expected_results.pop(0) + num_tool_calls = expected["num_tool_calls"] + + assert len(accumulated_tool_calls or []) == num_tool_calls, ( + f"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}" + ) + + if num_tool_calls > 0: + # Use the first accumulated tool call for assertion + tool_call = accumulated_tool_calls[0] + assert tool_call["function"]["name"] == expected["tool_name"], ( + f"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'" + ) + # Parse the accumulated arguments string for comparison + actual_arguments = json.loads(tool_call["function"]["arguments"]) + assert actual_arguments == expected["tool_arguments"], ( + f"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'" + ) + + # Prepare and append the tool response for the next turn + tool_response = tool_responses.pop(0) + messages.append( + { + "role": "tool", + "tool_call_id": tool_call["id"], + "content": tool_response["response"], + } + ) + else: + assert accumulated_content is not None and accumulated_content != "", "Expected content, but none received." 
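+            # Streaming variant: the same substring check runs against the accumulated streamed content.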
+ expected_answers = expected["answer"] + content_lower = accumulated_content.lower() + assert any(ans.lower() in content_lower for ans in expected_answers), ( + f"Expected one of {expected_answers} in content, but got: '{accumulated_content}'" + ) + + # --- Helper functions (structured output validation) --- @@ -324,3 +576,47 @@ def validate_structured_output(maybe_json_content: str, schema_name: str) -> Non assert len(structured_output.participants) == 2 elif schema_name == "valid_math_reasoning": assert len(structured_output.final_answer) > 0 + + +def _accumulate_streaming_tool_calls(stream): + """Accumulates tool calls and content from a streaming ChatCompletion response.""" + tool_calls_buffer = {} + current_id = None + full_content = "" # Initialize content accumulator + # Process streaming chunks + for chunk in stream: + choice = chunk.choices[0] + delta = choice.delta + + # Accumulate content + if delta.content: + full_content += delta.content + + if delta.tool_calls is None: + continue + + for tool_call_delta in delta.tool_calls: + if tool_call_delta.id: + current_id = tool_call_delta.id + call_id = current_id + # Skip if no ID seen yet for this tool call delta + if not call_id: + continue + func_delta = tool_call_delta.function + + if call_id not in tool_calls_buffer: + tool_calls_buffer[call_id] = { + "id": call_id, + "type": "function", # Assume function type + "function": {"name": None, "arguments": ""}, # Nested structure + } + + # Accumulate name and arguments into the nested function dict + if func_delta: + if func_delta.name: + tool_calls_buffer[call_id]["function"]["name"] = func_delta.name + if func_delta.arguments: + tool_calls_buffer[call_id]["function"]["arguments"] += func_delta.arguments + + # Return content and tool calls as a list + return full_content, list(tool_calls_buffer.values()) diff --git a/tests/verifications/test_results/fireworks.json b/tests/verifications/test_results/fireworks.json index 061e44c08..1fb6cb1b4 100644 --- a/tests/verifications/test_results/fireworks.json +++ b/tests/verifications/test_results/fireworks.json @@ -1,15 +1,15 @@ { - "created": 1744328795.171092, - "duration": 107.57908606529236, + "created": 1744679294.344288, + "duration": 243.49469900131226, "exitcode": 1, "root": "/Users/erichuang/projects/llama-stack", "environment": {}, "summary": { - "passed": 28, + "passed": 36, "skipped": 2, - "failed": 6, - "total": 36, - "collected": 36 + "failed": 40, + "total": 78, + "collected": 78 }, "collectors": [ { @@ -29,182 +29,392 @@ { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[accounts/fireworks/models/llama-v3p3-70b-instruct-earth]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[accounts/fireworks/models/llama-v3p3-70b-instruct-saturn]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[accounts/fireworks/models/llama4-scout-instruct-basic-earth]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[accounts/fireworks/models/llama4-scout-instruct-basic-saturn]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[accounts/fireworks/models/llama4-maverick-instruct-basic-earth]", 
"type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[accounts/fireworks/models/llama4-maverick-instruct-basic-saturn]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[accounts/fireworks/models/llama-v3p3-70b-instruct-earth]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[accounts/fireworks/models/llama-v3p3-70b-instruct-saturn]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[accounts/fireworks/models/llama4-scout-instruct-basic-earth]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[accounts/fireworks/models/llama4-scout-instruct-basic-saturn]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[accounts/fireworks/models/llama4-maverick-instruct-basic-earth]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[accounts/fireworks/models/llama4-maverick-instruct-basic-saturn]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", "type": "Function", - "lineno": 116 + "lineno": 117 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", "type": "Function", - "lineno": 116 + "lineno": 117 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", "type": "Function", - "lineno": 116 + "lineno": 117 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", "type": "Function", - "lineno": 135 + "lineno": 136 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", "type": "Function", - "lineno": 135 + "lineno": 136 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", "type": "Function", - "lineno": 135 + "lineno": 136 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[accounts/fireworks/models/llama-v3p3-70b-instruct-calendar]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[accounts/fireworks/models/llama-v3p3-70b-instruct-math]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[accounts/fireworks/models/llama4-scout-instruct-basic-calendar]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[accounts/fireworks/models/llama4-scout-instruct-basic-math]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[accounts/fireworks/models/llama4-maverick-instruct-basic-calendar]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[accounts/fireworks/models/llama4-maverick-instruct-basic-math]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[accounts/fireworks/models/llama-v3p3-70b-instruct-calendar]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[accounts/fireworks/models/llama-v3p3-70b-instruct-math]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[accounts/fireworks/models/llama4-scout-instruct-basic-calendar]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[accounts/fireworks/models/llama4-scout-instruct-basic-math]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[accounts/fireworks/models/llama4-maverick-instruct-basic-calendar]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[accounts/fireworks/models/llama4-maverick-instruct-basic-math]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", "type": "Function", - "lineno": 204 + "lineno": 205 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", "type": "Function", - "lineno": 204 + "lineno": 205 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", "type": "Function", - "lineno": 204 + "lineno": 205 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", "type": "Function", - "lineno": 228 + "lineno": 229 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", "type": "Function", - "lineno": 228 + "lineno": 229 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", "type": "Function", - "lineno": 228 + "lineno": 229 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", + "type": 
"Function", + "lineno": 257 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", + "type": "Function", + "lineno": 257 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", + "type": "Function", + "lineno": 257 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", + "type": "Function", + "lineno": 281 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", + "type": "Function", + "lineno": 281 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", + "type": "Function", + "lineno": 281 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", + "type": "Function", + "lineno": 308 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", + "type": "Function", + "lineno": 308 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", + "type": "Function", + "lineno": 308 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", + "type": "Function", + "lineno": 331 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", + "type": "Function", + "lineno": 331 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", + "type": "Function", + "lineno": 331 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-text_then_weather_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-weather_tool_then_text]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-add_product_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-get_then_create_event_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-text_then_weather_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-weather_tool_then_text]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-add_product_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-get_then_create_event_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-text_then_weather_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-weather_tool_then_text]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-add_product_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-get_then_create_event_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-text_then_weather_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-weather_tool_then_text]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-add_product_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-get_then_create_event_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-text_then_weather_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-weather_tool_then_text]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-add_product_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-get_then_create_event_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-text_then_weather_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-weather_tool_then_text]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-add_product_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-get_then_create_event_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 450 } ] } @@ -212,7 +422,7 @@ "tests": [ { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[accounts/fireworks/models/llama-v3p3-70b-instruct-earth]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[accounts/fireworks/models/llama-v3p3-70b-instruct-earth]", @@ -231,21 +441,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.2175025000469759, + "duration": 0.2540216660127044, "outcome": "passed" }, "call": { - "duration": 0.7433859170414507, + "duration": 0.6861197501420975, "outcome": "passed" }, "teardown": { - "duration": 0.0001592918997630477, + "duration": 
0.00015208404511213303, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[accounts/fireworks/models/llama-v3p3-70b-instruct-saturn]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[accounts/fireworks/models/llama-v3p3-70b-instruct-saturn]", @@ -264,21 +474,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.007383499993011355, + "duration": 0.006722707999870181, "outcome": "passed" }, "call": { - "duration": 0.5949292909353971, + "duration": 0.5997684169560671, "outcome": "passed" }, "teardown": { - "duration": 0.00015891704242676497, + "duration": 0.0002298750914633274, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[accounts/fireworks/models/llama4-scout-instruct-basic-earth]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[accounts/fireworks/models/llama4-scout-instruct-basic-earth]", @@ -297,21 +507,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.010730999987572432, + "duration": 0.015468083089217544, "outcome": "passed" }, "call": { - "duration": 0.8945954169612378, + "duration": 0.4625723329372704, "outcome": "passed" }, "teardown": { - "duration": 0.0003751249751076102, + "duration": 0.0003302919212728739, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[accounts/fireworks/models/llama4-scout-instruct-basic-saturn]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[accounts/fireworks/models/llama4-scout-instruct-basic-saturn]", @@ -330,21 +540,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.01665666699409485, + "duration": 0.014780875062569976, "outcome": "passed" }, "call": { - "duration": 0.907927209045738, + "duration": 0.4616922920104116, "outcome": "passed" }, "teardown": { - "duration": 0.00024874997325241566, + "duration": 0.0004110001027584076, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[accounts/fireworks/models/llama4-maverick-instruct-basic-earth]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[accounts/fireworks/models/llama4-maverick-instruct-basic-earth]", @@ -363,21 +573,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.01039199996739626, + "duration": 0.016551292035728693, "outcome": "passed" }, "call": { - "duration": 0.5971567500382662, + "duration": 0.9366653750184923, "outcome": "passed" }, "teardown": { - "duration": 0.0003488330403342843, + "duration": 0.00045104208402335644, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[accounts/fireworks/models/llama4-maverick-instruct-basic-saturn]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[accounts/fireworks/models/llama4-maverick-instruct-basic-saturn]", @@ -396,21 +606,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.018627874902449548, + "duration": 0.043513541808351874, "outcome": "passed" }, "call": { - "duration": 2.0586736251134425, + "duration": 0.5119727500714362, "outcome": "passed" }, "teardown": { - "duration": 0.00046974990982562304, + "duration": 0.00016754190437495708, "outcome": "passed" } }, { "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[accounts/fireworks/models/llama-v3p3-70b-instruct-earth]", - "lineno": 92, + "lineno": 93, "outcome": "passed", "keywords": [ "test_chat_streaming_basic[accounts/fireworks/models/llama-v3p3-70b-instruct-earth]", @@ -429,21 +639,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.01706262503284961, + "duration": 0.008419709047302604, "outcome": "passed" }, "call": { - "duration": 0.6679969580145553, + "duration": 0.7933078748174012, "outcome": "passed" }, "teardown": { - "duration": 0.0004670419730246067, + "duration": 0.00016583292745053768, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[accounts/fireworks/models/llama-v3p3-70b-instruct-saturn]", - "lineno": 92, + "lineno": 93, "outcome": "passed", "keywords": [ "test_chat_streaming_basic[accounts/fireworks/models/llama-v3p3-70b-instruct-saturn]", @@ -462,21 +672,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.025956374942325056, + "duration": 0.013550583040341735, "outcome": "passed" }, "call": { - "duration": 2.052679874934256, + "duration": 0.6633435001131147, "outcome": "passed" }, "teardown": { - "duration": 0.00026958296075463295, + "duration": 0.00023925001733005047, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[accounts/fireworks/models/llama4-scout-instruct-basic-earth]", - "lineno": 92, + "lineno": 93, "outcome": "passed", "keywords": [ "test_chat_streaming_basic[accounts/fireworks/models/llama4-scout-instruct-basic-earth]", @@ -495,21 +705,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.015856957994401455, + "duration": 0.007293834118172526, "outcome": "passed" }, "call": { - "duration": 0.3096678329166025, + "duration": 0.5193503750488162, "outcome": "passed" }, "teardown": { - "duration": 0.0007620420074090362, + "duration": 0.00018516601994633675, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[accounts/fireworks/models/llama4-scout-instruct-basic-saturn]", - "lineno": 92, + "lineno": 93, "outcome": "passed", "keywords": [ "test_chat_streaming_basic[accounts/fireworks/models/llama4-scout-instruct-basic-saturn]", @@ -528,21 +738,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.013509334065020084, + "duration": 0.009030540939420462, "outcome": "passed" }, "call": { - "duration": 0.5914681670255959, + "duration": 0.4338789170142263, "outcome": "passed" }, "teardown": { - "duration": 0.0002906669396907091, + "duration": 0.0004670829512178898, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[accounts/fireworks/models/llama4-maverick-instruct-basic-earth]", - "lineno": 92, + "lineno": 93, "outcome": "passed", "keywords": [ "test_chat_streaming_basic[accounts/fireworks/models/llama4-maverick-instruct-basic-earth]", @@ -561,21 +771,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.013216375024057925, + "duration": 0.01854533306322992, "outcome": "passed" }, "call": { - "duration": 1.8804527079919353, + "duration": 1.0042304168455303, "outcome": "passed" }, "teardown": { - "duration": 0.0002026669681072235, + "duration": 0.0004844998475164175, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[accounts/fireworks/models/llama4-maverick-instruct-basic-saturn]", - "lineno": 
92, + "lineno": 93, "outcome": "passed", "keywords": [ "test_chat_streaming_basic[accounts/fireworks/models/llama4-maverick-instruct-basic-saturn]", @@ -594,21 +804,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.00827441702131182, + "duration": 0.018001709133386612, "outcome": "passed" }, "call": { - "duration": 0.7407040420221165, + "duration": 0.5567380839493126, "outcome": "passed" }, "teardown": { - "duration": 0.0005084159784018993, + "duration": 0.00015412503853440285, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", - "lineno": 116, + "lineno": 117, "outcome": "skipped", "keywords": [ "test_chat_non_streaming_image[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", @@ -627,22 +837,22 @@ "case_id": "case0" }, "setup": { - "duration": 0.012424499960616231, + "duration": 0.008420375175774097, "outcome": "passed" }, "call": { - "duration": 0.00032762496266514063, + "duration": 0.00015591713599860668, "outcome": "skipped", - "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py', 125, 'Skipped: Skipping test_chat_non_streaming_image for model accounts/fireworks/models/llama-v3p3-70b-instruct on provider fireworks based on config.')" + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py', 126, 'Skipped: Skipping test_chat_non_streaming_image for model accounts/fireworks/models/llama-v3p3-70b-instruct on provider fireworks based on config.')" }, "teardown": { - "duration": 0.00032416603062301874, + "duration": 0.0001371251419186592, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", - "lineno": 116, + "lineno": 117, "outcome": "passed", "keywords": [ "test_chat_non_streaming_image[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", @@ -661,21 +871,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.02253958396613598, + "duration": 0.00672045792452991, "outcome": "passed" }, "call": { - "duration": 2.64042466704268, + "duration": 1.790064417058602, "outcome": "passed" }, "teardown": { - "duration": 0.0003636250039562583, + "duration": 0.0004657919052988291, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", - "lineno": 116, + "lineno": 117, "outcome": "passed", "keywords": [ "test_chat_non_streaming_image[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", @@ -694,21 +904,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.014634749968536198, + "duration": 0.015534916892647743, "outcome": "passed" }, "call": { - "duration": 5.126485540997237, + "duration": 3.2250108749140054, "outcome": "passed" }, "teardown": { - "duration": 0.0002988330088555813, + "duration": 0.00038420804776251316, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", - "lineno": 135, + "lineno": 136, "outcome": "skipped", "keywords": [ "test_chat_streaming_image[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", @@ -727,22 +937,22 @@ "case_id": "case0" }, "setup": { - "duration": 0.015854416065849364, + "duration": 
0.03246337501332164, "outcome": "passed" }, "call": { - "duration": 0.00038058299105614424, + "duration": 0.0005176670383661985, "outcome": "skipped", - "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py', 144, 'Skipped: Skipping test_chat_streaming_image for model accounts/fireworks/models/llama-v3p3-70b-instruct on provider fireworks based on config.')" + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py', 145, 'Skipped: Skipping test_chat_streaming_image for model accounts/fireworks/models/llama-v3p3-70b-instruct on provider fireworks based on config.')" }, "teardown": { - "duration": 0.0002689170651137829, + "duration": 0.0002715419977903366, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", - "lineno": 135, + "lineno": 136, "outcome": "passed", "keywords": [ "test_chat_streaming_image[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", @@ -761,21 +971,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.011205915943719447, + "duration": 0.12475762516260147, "outcome": "passed" }, "call": { - "duration": 3.2596546669956297, + "duration": 4.934706958010793, "outcome": "passed" }, "teardown": { - "duration": 0.0006222500232979655, + "duration": 0.00027604191564023495, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", - "lineno": 135, + "lineno": 136, "outcome": "passed", "keywords": [ "test_chat_streaming_image[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", @@ -794,21 +1004,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.016557667055167258, + "duration": 0.01025745808146894, "outcome": "passed" }, "call": { - "duration": 4.930164708988741, + "duration": 3.5653172079473734, "outcome": "passed" }, "teardown": { - "duration": 0.00048687495291233063, + "duration": 0.0005323749501258135, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[accounts/fireworks/models/llama-v3p3-70b-instruct-calendar]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[accounts/fireworks/models/llama-v3p3-70b-instruct-calendar]", @@ -827,21 +1037,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.00886166701093316, + "duration": 0.0553184999153018, "outcome": "passed" }, "call": { - "duration": 0.8833738330285996, + "duration": 1.366144834086299, "outcome": "passed" }, "teardown": { - "duration": 0.00025583396200090647, + "duration": 0.00042316620238125324, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[accounts/fireworks/models/llama-v3p3-70b-instruct-math]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[accounts/fireworks/models/llama-v3p3-70b-instruct-math]", @@ -860,21 +1070,21 @@ "case_id": "math" }, "setup": { - "duration": 0.01297520799562335, + "duration": 0.06981937494128942, "outcome": "passed" }, "call": { - "duration": 1.9960687910206616, + "duration": 2.829931082902476, "outcome": "passed" }, "teardown": { - "duration": 0.0005048330640420318, + "duration": 0.0003029161598533392, 
"outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[accounts/fireworks/models/llama4-scout-instruct-basic-calendar]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[accounts/fireworks/models/llama4-scout-instruct-basic-calendar]", @@ -893,21 +1103,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.007275875075720251, + "duration": 0.0244335001334548, "outcome": "passed" }, "call": { - "duration": 0.9094266659813002, + "duration": 0.7541109579615295, "outcome": "passed" }, "teardown": { - "duration": 0.00028041598852723837, + "duration": 0.0004666249733418226, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[accounts/fireworks/models/llama4-scout-instruct-basic-math]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[accounts/fireworks/models/llama4-scout-instruct-basic-math]", @@ -926,21 +1136,21 @@ "case_id": "math" }, "setup": { - "duration": 0.008899332955479622, + "duration": 0.016700832871720195, "outcome": "passed" }, "call": { - "duration": 3.117967874975875, + "duration": 2.208378749899566, "outcome": "passed" }, "teardown": { - "duration": 0.00017600005958229303, + "duration": 0.00016137491911649704, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[accounts/fireworks/models/llama4-maverick-instruct-basic-calendar]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[accounts/fireworks/models/llama4-maverick-instruct-basic-calendar]", @@ -959,21 +1169,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.0073364999843761325, + "duration": 0.006982124876230955, "outcome": "passed" }, "call": { - "duration": 2.2714374579954892, + "duration": 0.6431179158389568, "outcome": "passed" }, "teardown": { - "duration": 0.0001814159331843257, + "duration": 0.00033412501215934753, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[accounts/fireworks/models/llama4-maverick-instruct-basic-math]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[accounts/fireworks/models/llama4-maverick-instruct-basic-math]", @@ -992,21 +1202,21 @@ "case_id": "math" }, "setup": { - "duration": 0.010546459001488984, + "duration": 0.015676999930292368, "outcome": "passed" }, "call": { - "duration": 3.9954450000077486, + "duration": 4.404933541081846, "outcome": "passed" }, "teardown": { - "duration": 0.0002719159238040447, + "duration": 0.0002617498394101858, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[accounts/fireworks/models/llama-v3p3-70b-instruct-calendar]", - "lineno": 182, + "lineno": 183, "outcome": "passed", "keywords": [ "test_chat_streaming_structured_output[accounts/fireworks/models/llama-v3p3-70b-instruct-calendar]", @@ -1025,21 +1235,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.012508000014349818, + "duration": 0.07572970795445144, "outcome": "passed" }, "call": { - "duration": 9.095425167004578, + "duration": 1.1367775409016758, "outcome": "passed" }, "teardown": { - "duration": 0.00029200001154094934, + 
"duration": 0.0006681671366095543, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[accounts/fireworks/models/llama-v3p3-70b-instruct-math]", - "lineno": 182, + "lineno": 183, "outcome": "passed", "keywords": [ "test_chat_streaming_structured_output[accounts/fireworks/models/llama-v3p3-70b-instruct-math]", @@ -1058,21 +1268,21 @@ "case_id": "math" }, "setup": { - "duration": 0.014769250061362982, + "duration": 0.028525790898129344, "outcome": "passed" }, "call": { - "duration": 1.9875252910424024, + "duration": 2.1424834579229355, "outcome": "passed" }, "teardown": { - "duration": 0.0006288329605013132, + "duration": 0.0003642500378191471, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[accounts/fireworks/models/llama4-scout-instruct-basic-calendar]", - "lineno": 182, + "lineno": 183, "outcome": "passed", "keywords": [ "test_chat_streaming_structured_output[accounts/fireworks/models/llama4-scout-instruct-basic-calendar]", @@ -1091,21 +1301,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.014440709026530385, + "duration": 0.0146782910451293, "outcome": "passed" }, "call": { - "duration": 1.2613736250204965, + "duration": 15.13383225002326, "outcome": "passed" }, "teardown": { - "duration": 0.0001937919296324253, + "duration": 0.00045950012281537056, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[accounts/fireworks/models/llama4-scout-instruct-basic-math]", - "lineno": 182, + "lineno": 183, "outcome": "passed", "keywords": [ "test_chat_streaming_structured_output[accounts/fireworks/models/llama4-scout-instruct-basic-math]", @@ -1124,21 +1334,21 @@ "case_id": "math" }, "setup": { - "duration": 0.0071510839043185115, + "duration": 0.01714799995534122, "outcome": "passed" }, "call": { - "duration": 2.2953888749470934, + "duration": 10.714752790983766, "outcome": "passed" }, "teardown": { - "duration": 0.00016245793085545301, + "duration": 0.00027029216289520264, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[accounts/fireworks/models/llama4-maverick-instruct-basic-calendar]", - "lineno": 182, + "lineno": 183, "outcome": "passed", "keywords": [ "test_chat_streaming_structured_output[accounts/fireworks/models/llama4-maverick-instruct-basic-calendar]", @@ -1157,21 +1367,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.007294666953384876, + "duration": 0.010765291983261704, "outcome": "passed" }, "call": { - "duration": 2.194703874993138, + "duration": 0.6682700838427991, "outcome": "passed" }, "teardown": { - "duration": 0.00017604196909815073, + "duration": 0.00015808409079909325, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[accounts/fireworks/models/llama4-maverick-instruct-basic-math]", - "lineno": 182, + "lineno": 183, "outcome": "passed", "keywords": [ "test_chat_streaming_structured_output[accounts/fireworks/models/llama4-maverick-instruct-basic-math]", @@ -1190,21 +1400,21 @@ "case_id": "math" }, "setup": { - "duration": 0.019950625021010637, + "duration": 0.0071080829948186874, "outcome": "passed" }, "call": { - "duration": 8.4994609169662, + "duration": 1.9725822920445353, "outcome": "passed" }, "teardown": { - "duration": 0.00026404205709695816, + 
"duration": 0.0004201668780297041, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", - "lineno": 204, + "lineno": 205, "outcome": "failed", "keywords": [ "test_chat_non_streaming_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", @@ -1223,34 +1433,34 @@ "case_id": "case0" }, "setup": { - "duration": 0.011928000021725893, + "duration": 0.013940333155915141, "outcome": "passed" }, "call": { - "duration": 0.5664792089955881, + "duration": 0.5732313331682235, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 223, + "lineno": 224, "message": "TypeError: object of type 'NoneType' has no len()" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 223, + "lineno": 224, "message": "TypeError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert len(response.choices[0].message.tool_calls) > 0\nE TypeError: object of type 'NoneType' has no len()\n\ntests/verifications/openai_api/test_chat_completion.py:223: TypeError" + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, 
provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert len(response.choices[0].message.tool_calls) > 0\nE TypeError: object of type 'NoneType' has no len()\n\ntests/verifications/openai_api/test_chat_completion.py:224: TypeError" }, "teardown": { - "duration": 0.00023799994960427284, + "duration": 0.00022962503135204315, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", - "lineno": 204, + "lineno": 205, "outcome": "failed", "keywords": [ "test_chat_non_streaming_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", @@ -1269,34 +1479,34 @@ "case_id": "case0" }, "setup": { - "duration": 0.006813624990172684, + "duration": 0.006374292075634003, "outcome": "passed" }, "call": { - "duration": 3.170418416033499, + "duration": 7.2776273330673575, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 223, + "lineno": 224, "message": "TypeError: object of type 'NoneType' has no len()" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 223, + "lineno": 224, "message": "TypeError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert len(response.choices[0].message.tool_calls) > 0\nE TypeError: object of type 'NoneType' has no len()\n\ntests/verifications/openai_api/test_chat_completion.py:223: TypeError" + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 
'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert len(response.choices[0].message.tool_calls) > 0\nE TypeError: object of type 'NoneType' has no len()\n\ntests/verifications/openai_api/test_chat_completion.py:224: TypeError" }, "teardown": { - "duration": 0.0004129580920562148, + "duration": 0.0004100420046597719, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", - "lineno": 204, + "lineno": 205, "outcome": "failed", "keywords": [ "test_chat_non_streaming_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", @@ -1315,34 +1525,34 @@ "case_id": "case0" }, "setup": { - "duration": 0.01656208303757012, + "duration": 0.012761292047798634, "outcome": "passed" }, "call": { - "duration": 22.76337137504015, + "duration": 0.8920639578718692, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 223, + "lineno": 224, "message": "TypeError: object of type 'NoneType' has no len()" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 223, + "lineno": 224, "message": "TypeError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n 
tools=case[\"input\"][\"tools\"],\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert len(response.choices[0].message.tool_calls) > 0\nE TypeError: object of type 'NoneType' has no len()\n\ntests/verifications/openai_api/test_chat_completion.py:223: TypeError" + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert len(response.choices[0].message.tool_calls) > 0\nE TypeError: object of type 'NoneType' has no len()\n\ntests/verifications/openai_api/test_chat_completion.py:224: TypeError" }, "teardown": { - "duration": 0.00038704206235706806, + "duration": 0.0004124999977648258, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", - "lineno": 228, + "lineno": 229, "outcome": "failed", "keywords": [ "test_chat_streaming_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", @@ -1361,34 +1571,34 @@ "case_id": "case0" }, "setup": { - "duration": 0.015727541991509497, + "duration": 0.013205124996602535, "outcome": "passed" }, "call": { - "duration": 0.5719050420448184, + "duration": 1.930448625003919, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 274, - "message": "assert 0 == 1\n + where 0 = len({})" + "lineno": 248, + "message": "assert 0 == 1\n + where 0 = len([])" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 274, + "lineno": 248, "message": "AssertionError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 
'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n # Accumulate partial tool_calls here\n tool_calls_buffer = {}\n current_id = None\n # Process streaming chunks\n for chunk in stream:\n choice = chunk.choices[0]\n delta = choice.delta\n \n if delta.tool_calls is None:\n continue\n \n for tool_call_delta in delta.tool_calls:\n if tool_call_delta.id:\n current_id = tool_call_delta.id\n call_id = current_id\n func_delta = tool_call_delta.function\n \n if call_id not in tool_calls_buffer:\n tool_calls_buffer[call_id] = {\n \"id\": call_id,\n \"type\": tool_call_delta.type,\n \"name\": func_delta.name,\n \"arguments\": \"\",\n }\n \n if func_delta.arguments:\n tool_calls_buffer[call_id][\"arguments\"] += func_delta.arguments\n \n> assert len(tool_calls_buffer) == 1\nE assert 0 == 1\nE + where 0 = len({})\n\ntests/verifications/openai_api/test_chat_completion.py:274: AssertionError" + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n> assert len(tool_calls_buffer) == 1\nE assert 0 == 1\nE + where 0 = len([])\n\ntests/verifications/openai_api/test_chat_completion.py:248: AssertionError" }, "teardown": { - "duration": 0.0003532909322530031, + "duration": 0.0005771249998360872, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", - "lineno": 228, + "lineno": 229, "outcome": "failed", "keywords": [ "test_chat_streaming_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", @@ -1407,34 +1617,34 @@ "case_id": 
"case0" }, "setup": { - "duration": 0.011914041941054165, + "duration": 0.01408083294518292, "outcome": "passed" }, "call": { - "duration": 5.403063916950487, + "duration": 10.029349042102695, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 274, - "message": "assert 0 == 1\n + where 0 = len({})" + "lineno": 248, + "message": "assert 0 == 1\n + where 0 = len([])" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 274, + "lineno": 248, "message": "AssertionError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n # Accumulate partial tool_calls here\n tool_calls_buffer = {}\n current_id = None\n # Process streaming chunks\n for chunk in stream:\n choice = chunk.choices[0]\n delta = choice.delta\n \n if delta.tool_calls is None:\n continue\n \n for tool_call_delta in delta.tool_calls:\n if tool_call_delta.id:\n current_id = tool_call_delta.id\n call_id = current_id\n func_delta = tool_call_delta.function\n \n if call_id not in tool_calls_buffer:\n tool_calls_buffer[call_id] = {\n \"id\": call_id,\n \"type\": tool_call_delta.type,\n \"name\": func_delta.name,\n \"arguments\": \"\",\n }\n \n if func_delta.arguments:\n tool_calls_buffer[call_id][\"arguments\"] += func_delta.arguments\n \n> assert len(tool_calls_buffer) == 1\nE assert 0 == 1\nE + where 0 = len({})\n\ntests/verifications/openai_api/test_chat_completion.py:274: AssertionError" + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n 
ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n> assert len(tool_calls_buffer) == 1\nE assert 0 == 1\nE + where 0 = len([])\n\ntests/verifications/openai_api/test_chat_completion.py:248: AssertionError" }, "teardown": { - "duration": 0.0005193749675527215, + "duration": 0.0004449589177966118, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", - "lineno": 228, + "lineno": 229, "outcome": "failed", "keywords": [ "test_chat_streaming_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", @@ -1453,31 +1663,1859 @@ "case_id": "case0" }, "setup": { - "duration": 0.012608832912519574, + "duration": 0.013213291997089982, "outcome": "passed" }, "call": { - "duration": 7.587262416025624, + "duration": 8.608150291023776, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 274, - "message": "assert 0 == 1\n + where 0 = len({})" + "lineno": 248, + "message": "assert 0 == 1\n + where 0 = len([])" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 274, + "lineno": 248, "message": "AssertionError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n # Accumulate partial tool_calls here\n tool_calls_buffer = {}\n current_id = None\n # Process streaming chunks\n for chunk in stream:\n choice = chunk.choices[0]\n delta = choice.delta\n \n if delta.tool_calls is None:\n continue\n \n for tool_call_delta in delta.tool_calls:\n if tool_call_delta.id:\n current_id = tool_call_delta.id\n call_id = current_id\n func_delta = 
tool_call_delta.function\n \n if call_id not in tool_calls_buffer:\n tool_calls_buffer[call_id] = {\n \"id\": call_id,\n \"type\": tool_call_delta.type,\n \"name\": func_delta.name,\n \"arguments\": \"\",\n }\n \n if func_delta.arguments:\n tool_calls_buffer[call_id][\"arguments\"] += func_delta.arguments\n \n> assert len(tool_calls_buffer) == 1\nE assert 0 == 1\nE + where 0 = len({})\n\ntests/verifications/openai_api/test_chat_completion.py:274: AssertionError" + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n> assert len(tool_calls_buffer) == 1\nE assert 0 == 1\nE + where 0 = len([])\n\ntests/verifications/openai_api/test_chat_completion.py:248: AssertionError" }, "teardown": { - "duration": 0.0008685829816386104, + "duration": 0.0005860829260200262, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", + "lineno": 257, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_choice_required[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "case0" + }, + "setup": { + "duration": 0.01437820796854794, + "outcome": "passed" + }, + "call": { + "duration": 0.7105170420836657, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00017283298075199127, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", + "lineno": 257, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_tool_choice_required[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": 
{ + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "case0" + }, + "setup": { + "duration": 0.009220415959134698, + "outcome": "passed" + }, + "call": { + "duration": 5.718667333945632, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 277, + "message": "TypeError: object of type 'NoneType' has no len()" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 277, + "message": "TypeError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"required\", # Force tool call\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert len(response.choices[0].message.tool_calls) > 0, \"Expected tool call when tool_choice='required'\"\nE TypeError: object of type 'NoneType' has no len()\n\ntests/verifications/openai_api/test_chat_completion.py:277: TypeError" + }, + "teardown": { + "duration": 0.0003282078541815281, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", + "lineno": 257, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_tool_choice_required[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "case0" + }, + "setup": { + "duration": 0.014709000010043383, + "outcome": "passed" + }, + "call": { + "duration": 1.7260455000214279, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 277, + "message": "TypeError: object of type 'NoneType' has no len()" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 277, + "message": "TypeError" + } + ], + "longrepr": "request = 
>\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"required\", # Force tool call\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert len(response.choices[0].message.tool_calls) > 0, \"Expected tool call when tool_choice='required'\"\nE TypeError: object of type 'NoneType' has no len()\n\ntests/verifications/openai_api/test_chat_completion.py:277: TypeError" + }, + "teardown": { + "duration": 0.00022012507542967796, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", + "lineno": 281, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_tool_choice_required[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "case0" + }, + "setup": { + "duration": 0.008183792000636458, + "outcome": "passed" + }, + "call": { + "duration": 1.9683502500411123, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0007690000347793102, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", + "lineno": 281, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_tool_choice_required[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "case0" + }, + "setup": { + "duration": 0.014906208030879498, + "outcome": "passed" + }, + "call": { + "duration": 11.76459054206498, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 
302, + "message": "AssertionError: Expected tool call when tool_choice='required'\nassert 0 > 0\n + where 0 = len([])" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 302, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"required\", # Force tool call\n stream=True,\n )\n \n _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n \n> assert len(tool_calls_buffer) > 0, \"Expected tool call when tool_choice='required'\"\nE AssertionError: Expected tool call when tool_choice='required'\nE assert 0 > 0\nE + where 0 = len([])\n\ntests/verifications/openai_api/test_chat_completion.py:302: AssertionError" + }, + "teardown": { + "duration": 0.0003086249344050884, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", + "lineno": 281, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_tool_choice_required[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "case0" + }, + "setup": { + "duration": 0.021144041791558266, + "outcome": "passed" + }, + "call": { + "duration": 2.4300453749019653, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 302, + "message": "AssertionError: Expected tool call when tool_choice='required'\nassert 0 > 0\n + where 0 = len([])" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 302, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 
'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"required\", # Force tool call\n stream=True,\n )\n \n _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n \n> assert len(tool_calls_buffer) > 0, \"Expected tool call when tool_choice='required'\"\nE AssertionError: Expected tool call when tool_choice='required'\nE assert 0 > 0\nE + where 0 = len([])\n\ntests/verifications/openai_api/test_chat_completion.py:302: AssertionError" + }, + "teardown": { + "duration": 0.00037800008431077003, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", + "lineno": 308, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_choice_none[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "case0" + }, + "setup": { + "duration": 0.007929167011752725, + "outcome": "passed" + }, + "call": { + "duration": 1.0130669160280377, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004307499621063471, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", + "lineno": 308, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_choice_none[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "case0" + }, + "setup": { + "duration": 0.010822792071849108, + "outcome": "passed" + }, + "call": { + "duration": 4.663267957977951, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0006220841314643621, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", + "lineno": 308, + "outcome": 
"passed", + "keywords": [ + "test_chat_non_streaming_tool_choice_none[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "case0" + }, + "setup": { + "duration": 0.010691167088225484, + "outcome": "passed" + }, + "call": { + "duration": 3.383276625070721, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00047616707161068916, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", + "lineno": 331, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_tool_choice_none[accounts/fireworks/models/llama-v3p3-70b-instruct-case0]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "case0" + }, + "setup": { + "duration": 0.030178457964211702, + "outcome": "passed" + }, + "call": { + "duration": 0.4668415829073638, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0007963338866829872, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", + "lineno": 331, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_tool_choice_none[accounts/fireworks/models/llama4-scout-instruct-basic-case0]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "case0" + }, + "setup": { + "duration": 0.011727249948307872, + "outcome": "passed" + }, + "call": { + "duration": 11.540696125011891, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0009242501109838486, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", + "lineno": 331, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_tool_choice_none[accounts/fireworks/models/llama4-maverick-instruct-basic-case0]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "case0" + }, + "setup": { + "duration": 0.008536209119483829, + "outcome": "passed" + }, + "call": { + "duration": 3.6622679999563843, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005495408549904823, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-text_then_weather_tool]", + "lineno": 359, + "outcome": "failed", + 
"keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-text_then_weather_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.017524708062410355, + "outcome": "passed" + }, + "call": { + "duration": 0.625571500044316, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 446, + "message": "AssertionError: Expected one of ['sol'] in content, but got: 'I am not able to execute this task as it exceeds the limitations of the functions I have been given.'\nassert False\n + where False = any(. at 0x1073e5cb0>)" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 446, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n tool_call = assistant_message.tool_calls[0]\n assert tool_call.function.name == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call.function.name}'\"\n )\n # Parse the JSON string arguments before comparing\n actual_arguments = json.loads(tool_call.function.arguments)\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": tool_call.id,\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n assert assistant_message.content is not None, \"Expected content, but none received.\"\n expected_answers = expected[\"answer\"] # This is now a list\n content_lower = assistant_message.content.lower()\n> assert any(ans.lower() in content_lower for ans in expected_answers), (\n f\"Expected one of {expected_answers} in content, but got: '{assistant_message.content}'\"\n )\nE AssertionError: Expected one of ['sol'] in content, but got: 'I am not able to execute this task as it exceeds the limitations of the functions I have been given.'\nE assert False\nE + where False = any(. 
at 0x1073e5cb0>)\n\ntests/verifications/openai_api/test_chat_completion.py:446: AssertionError" + }, + "teardown": { + "duration": 0.00044062500819563866, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-weather_tool_then_text]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-weather_tool_then_text]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.01056775008328259, + "outcome": "passed" + }, + "call": { + "duration": 0.5624969999771565, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='{\"type\": \"function\", \"name\": \"get_weather\", \"parameters\": {\"location\": \"San Francisco, CA\"}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='{\"type\": \"function\", \"name\": \"get_weather\", \"parameters\": {\"location\": \"San Francisco, CA\"}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.0004401658661663532, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-add_product_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-add_product_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.013444249983876944, + "outcome": "passed" + }, + "call": { + "duration": 0.8705885419622064, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='{\"type\": \"function\", \"name\": \"addProduct\", \"parameters\": {\"name\": \"Widget\", \"price\": \"19.99\", \"inStock\": \"true\", \"tags\": \"[\\\\\"new\\\\\", \\\\\"sale\\\\\"]\"}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 
'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'add_product_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'inStock': True, 'name': 'Widget...}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': 'Successfully added product with id: 123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='{\"type\": \"function\", \"name\": \"addProduct\", \"parameters\": {\"name\": \"Widget\", \"price\": \"19.99\", \"inStock\": \"true\", \"tags\": \"[\\\\\"new\\\\\", \\\\\"sale\\\\\"]\"}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.0004647918976843357, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-get_then_create_event_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.013817500090226531, + "outcome": "passed" + }, + "call": { + "duration": 0.6882082498632371, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='{\"type\": \"function\", \"name\": \"get_event\", \"parameters\": {\"date\": \"2025-03-03\", \"time\": \"10:00\"}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='{\"type\": \"function\", \"name\": \"get_event\", \"parameters\": {\"date\": \"2025-03-03\", \"time\": \"10:00\"}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.0005112909711897373, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-compare_monthly_expense_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.013548000017181039, + "outcome": "passed" + }, + "call": { + "duration": 0.5821714580524713, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='{\"type\": \"function\", \"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": \"1\", \"year\": \"2025\"}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 
'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='{\"type\": \"function\", \"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": \"1\", \"year\": \"2025\"}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.00021225004456937313, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-text_then_weather_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-text_then_weather_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.0070156671572476625, + "outcome": "passed" + }, + "call": { + "duration": 8.95718324999325, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='```\\n{\\n \"name\": \"get_weather\",\\n \"parameters\": {\\n \"description\": \"Get the current weather\",\\n \"parameters\": {\\n \"location\": {\\n \"description\": \"The city and state (both required)\",\\n \"type\": \"object\",\\n \"properties\": {\\n \"location\": {\\n \"description\": \"The city and state, e.g. San Francisco, CA.\",\\n \"type\": \"string\"\\n }\\n }\\n }\\n },\\n \"type\": \"object\",\\n \"properties\": {\\n \"location\": \"San Francisco, CA.\"\\n }\\n }\\n}\\n```', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = 
copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='```\\n{\\n \"name\": \"get_weather\",\\n \"parameters\": {\\n \"description\": \"Get the current weather\",\\n \"parameters\": {\\n \"location\": {\\n \"description\": \"The city and state (both required)\",\\n \"type\": \"object\",\\n \"properties\": {\\n \"location\": {\\n \"description\": \"The city and state, e.g. 
San Francisco, CA.\",\\n \"type\": \"string\"\\n }\\n }\\n }\\n },\\n \"type\": \"object\",\\n \"properties\": {\\n \"location\": \"San Francisco, CA.\"\\n }\\n }\\n}\\n```', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.00045741605572402477, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-weather_tool_then_text]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-weather_tool_then_text]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.011042665923014283, + "outcome": "passed" + }, + "call": { + "duration": 3.372867708094418, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='{\"name\": \"get_weather\", \"parameters\": {\"description\": \"Get the current weather\", \"parameters\": {\"type\": \"object\", \"properties\": {\"location\": {\"description\": \"The city and state (both required)\", \"type\": \"string\"}}}, \"required\": [\"location\"]}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the 
messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='{\"name\": \"get_weather\", \"parameters\": {\"description\": \"Get the current weather\", \"parameters\": {\"type\": \"object\", \"properties\": {\"location\": {\"description\": \"The city and state (both required)\", \"type\": \"string\"}}}, \"required\": [\"location\"]}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.00042333384044468403, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-add_product_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-add_product_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.01305404189042747, + "outcome": "passed" + }, + "call": { + "duration": 3.5883425418287516, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='{\"name\": \"addProduct\", 
\"parameters\": {\"description\": \"Add a new product\", \"type\": \"object\", \"properties\": {\"name\": {\"description\": \"Name of the product\", \"type\": \"string\"}, \"price\": {\"description\": \"Price of the product\", \"type\": \"number\"}, \"inStock\": {\"description\": \"Availability status of the product\", \"type\": \"boolean\"}, \"tags\": {\"description\": \"List of product tags\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}}, \"required\": [\"name\", \"price\", \"inStock\", \"tags\"]}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'add_product_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'inStock': True, 'name': 'Widget...}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': 'Successfully added product with id: 123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='{\"name\": \"addProduct\", \"parameters\": {\"description\": \"Add a new product\", \"type\": \"object\", \"properties\": {\"name\": {\"description\": \"Name of the product\", \"type\": \"string\"}, \"price\": {\"description\": \"Price of the product\", \"type\": \"number\"}, \"inStock\": {\"description\": \"Availability status of the product\", \"type\": \"boolean\"}, \"tags\": {\"description\": \"List of product tags\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}}, \"required\": [\"name\", \"price\", \"inStock\", \"tags\"]}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.0005818749777972698, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-get_then_create_event_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.01428320910781622, + "outcome": "passed" + }, + "call": { + "duration": 15.402638916159049, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\"}, \"time\": {\"description\": \"Event 
Time (HH:MM)\", \"type\": \"string\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event...: \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def 
test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", 
\"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event...: \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}assistant\\n\\n{\"name\": \"get_event\", \"parameters\": {\"date\": {\"description\": \"Date of the event in ISO format\", \"type\": \"string\", \"value\": \"2025-03-03\"}, \"time\": {\"description\": \"Event Time (HH:MM)\", \"type\": \"string\", \"value\": \"10:00\"}}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.0004401251208037138, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-compare_monthly_expense_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.021037542028352618, + "outcome": "passed" + }, + "call": { + "duration": 6.548705333843827, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='{\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": {\"description\": \"Month of the year (1-12)\", \"type\": \"integer\"}, \"year\": {\"description\": \"Year\", \"type\": \"integer\"}}}assistant\\n\\n{\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": {\"description\": \"Month of the year (1-12)\", \"type\": \"integer\"}, \"year\": {\"description\": \"Year\", \"type\": \"integer\"}}}assistant\\n\\n{\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": {\"description\": \"Month of the year (1-12)\", \"type\": \"integer\"}, \"year\": {\"description\": \"Year\", \"type\": 
\"integer\"}}}assistant\\n\\n{\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": {\"description\": \"Month of the year (1-12)\", \"type\": \"integer\", \"value\": 1}, \"year\": {\"description\": \"Year\", \"type\": \"integer\", \"value\": 2025}}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='{\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": {\"description\": \"Month of the year (1-12)\", \"type\": \"integer\"}, \"year\": {\"description\": \"Year\", \"type\": \"integer\"}}}assistant\\n\\n{\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": {\"description\": \"Month of the year (1-12)\", \"type\": \"integer\"}, \"year\": {\"description\": \"Year\", \"type\": \"integer\"}}}assistant\\n\\n{\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": {\"description\": \"Month of the year (1-12)\", \"type\": \"integer\"}, \"year\": {\"description\": \"Year\", \"type\": \"integer\"}}}assistant\\n\\n{\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": {\"description\": \"Month of the year (1-12)\", \"type\": \"integer\", \"value\": 1}, \"year\": {\"description\": \"Year\", \"type\": \"integer\", \"value\": 2025}}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.00035033305175602436, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-text_then_weather_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-text_then_weather_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.00768870790489018, + "outcome": "passed" + }, + "call": { + "duration": 3.410787041997537, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + 
"lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='To answer the question about the weather in San Francisco, we can directly utilize the provided function `get_weather` as it matches the context of the query.\\n\\nThe function `get_weather` requires a `location` parameter. Given that San Francisco is a city and assuming California (CA) is the state, we can directly fit the query into the provided function format.\\n\\nHere\\'s the response in the required JSON format:\\n\\n```json\\n{\\n \"name\": \"get_weather\",\\n \"parameters\": {\\n \"location\": \"San Francisco, CA\"\\n }\\n}\\n```', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='To answer the question about the weather in San Francisco, we can directly utilize the provided function `get_weather` as it matches the context of the query.\\n\\nThe function `get_weather` requires a `location` parameter. Given that San Francisco is a city and assuming California (CA) is the state, we can directly fit the query into the provided function format.\\n\\nHere\\'s the response in the required JSON format:\\n\\n```json\\n{\\n \"name\": \"get_weather\",\\n \"parameters\": {\\n \"location\": \"San Francisco, CA\"\\n }\\n}\\n```', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.0002946250606328249, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-weather_tool_then_text]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-weather_tool_then_text]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.009200166910886765, + "outcome": "passed" + }, + "call": { + "duration": 0.5177558751311153, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='{\"name\": \"get_weather\", \"parameters\": {\"location\": \"San Francisco, CA\"}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, 
tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='{\"name\": \"get_weather\", \"parameters\": {\"location\": \"San Francisco, CA\"}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.00025020912289619446, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-add_product_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-add_product_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.007124624913558364, + "outcome": "passed" + }, + "call": { + "duration": 0.6132153749931604, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='{\"name\": \"addProduct\", \"parameters\": {\"name\": \"Widget\", \"price\": 19.99, \"inStock\": true, \"tags\": [\"new\", \"sale\"]}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 
'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'add_product_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'inStock': True, 'name': 'Widget...}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': 'Successfully added product with id: 123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='{\"name\": \"addProduct\", \"parameters\": {\"name\": \"Widget\", \"price\": 19.99, \"inStock\": true, \"tags\": [\"new\", \"sale\"]}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.0003745418507605791, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-get_then_create_event_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.01410404103808105, + "outcome": "passed" + }, + "call": { + "duration": 1.3956649999599904, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='{\"type\": \"function\", \"name\": \"get_event\", \"parameters\": {\"date\": \"2025-03-03\", \"time\": \"10:00\"}}<|uniquepaddingtoken557|>---\"\"\"\"\"\"\"\"\"---\" \" \" \"\"\" \" \" \"Interaction\"\"\\n\\nI am unable to execute this task as it exceeds the limitations of the functions I have at hand.\"', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. 
we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='{\"type\": \"function\", \"name\": \"get_event\", \"parameters\": {\"date\": \"2025-03-03\", \"time\": \"10:00\"}}<|uniquepaddingtoken557|>---\"\"\"\"\"\"\"\"\"---\" \" \" \"\"\" \" \" \"Interaction\"\"\\n\\nI am unable to execute this task as it exceeds the limitations of the functions I have at hand.\"', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.00041033304296433926, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-compare_monthly_expense_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.027331124991178513, + "outcome": "passed" + }, + "call": { + "duration": 2.465563999954611, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len((None or []))\n + where None = ChatCompletionMessage(content='{\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": 1, \"year\": 2024}}\"\\n\\nThe provided JSON describes a function `getMonthlyExpenseSummary` that takes two parameters: `month` and `year`. The prompt asks for the monthly expense in January of this year. 
Assuming the current year is 2024, we can fill in the appropriate values for `month` and `year`.\\n\\nThe value for `month` should be `1` (January is the first month), and the value for `year` should be `2024`.\\n\\nTherefore, the appropriate function call with its arguments is:assistant\\n\\nimport datetime\\n\\n# Get the current year\\ncurrent_year = datetime.datetime.now().year\\n\\n# The function call with its arguments\\nprint({\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": 1, \"year\": current_year}})\"{\\\\\"name\\\\\": \\\\\"getMonthlyExpenseSummary\\\\\", \\\\\"parameters\\\\\": {\\\\\"month\\\\\": 1, \\\\\"year\\\\\": 2024}}\"assistant\\n\\nThe final response is: {\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": 1, \"year\": 2024}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len((None or []))\nE + where None = ChatCompletionMessage(content='{\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": 1, \"year\": 2024}}\"\\n\\nThe provided JSON describes a function `getMonthlyExpenseSummary` that takes two parameters: `month` and `year`. The prompt asks for the monthly expense in January of this year. Assuming the current year is 2024, we can fill in the appropriate values for `month` and `year`.\\n\\nThe value for `month` should be `1` (January is the first month), and the value for `year` should be `2024`.\\n\\nTherefore, the appropriate function call with its arguments is:assistant\\n\\nimport datetime\\n\\n# Get the current year\\ncurrent_year = datetime.datetime.now().year\\n\\n# The function call with its arguments\\nprint({\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": 1, \"year\": current_year}})\"{\\\\\"name\\\\\": \\\\\"getMonthlyExpenseSummary\\\\\", \\\\\"parameters\\\\\": {\\\\\"month\\\\\": 1, \\\\\"year\\\\\": 2024}}\"assistant\\n\\nThe final response is: {\"name\": \"getMonthlyExpenseSummary\", \"parameters\": {\"month\": 1, \"year\": 2024}}', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.0005783340893685818, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-text_then_weather_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-text_then_weather_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.016343542141839862, + "outcome": "passed" + }, + "call": { + "duration": 0.6930254579056054, + 
"outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 529, + "message": "AssertionError: Expected one of ['sol'] in content, but got: 'I cannot accomplish this task as it requires capabilities beyond those offered by the provided functions.'\nassert False\n + where False = any(. at 0x10738e0a0>)" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 529, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n # Use the first accumulated tool call for assertion\n tool_call = accumulated_tool_calls[0]\n assert tool_call[\"function\"][\"name\"] == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'\"\n )\n # Parse the accumulated arguments string for comparison\n 
actual_arguments = json.loads(tool_call[\"function\"][\"arguments\"])\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": tool_call[\"id\"],\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n assert accumulated_content is not None and accumulated_content != \"\", \"Expected content, but none received.\"\n expected_answers = expected[\"answer\"]\n content_lower = accumulated_content.lower()\n> assert any(ans.lower() in content_lower for ans in expected_answers), (\n f\"Expected one of {expected_answers} in content, but got: '{accumulated_content}'\"\n )\nE AssertionError: Expected one of ['sol'] in content, but got: 'I cannot accomplish this task as it requires capabilities beyond those offered by the provided functions.'\nE assert False\nE + where False = any(. at 0x10738e0a0>)\n\ntests/verifications/openai_api/test_chat_completion.py:529: AssertionError" + }, + "teardown": { + "duration": 0.00024741701781749725, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-weather_tool_then_text]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-weather_tool_then_text]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.007791666081175208, + "outcome": "passed" + }, + "call": { + "duration": 0.4420052089262754, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or []))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, 
model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.000628374982625246, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-add_product_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-add_product_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.013015333097428083, + "outcome": "passed" + }, + "call": { + "duration": 0.6754761249758303, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or []))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'add_product_tool', 'expected': [{'num_tool_calls': 1, 
'tool_arguments': {'inStock': True, 'name': 'Widget...}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': 'Successfully added product with id: 123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.000581083819270134, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-get_then_create_event_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama-v3p3-70b-instruct-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.0128930420614779, + "outcome": "passed" + }, + "call": { + "duration": 0.367436750093475, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or 
[]))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.00024812505580484867, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-compare_monthly_expense_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama-v3p3-70b-instruct-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + 
"accounts/fireworks/models/llama-v3p3-70b-instruct-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama-v3p3-70b-instruct", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.006677915807813406, + "outcome": "passed" + }, + "call": { + "duration": 0.5142939588986337, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or []))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama-v3p3-70b-instruct'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 
0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.0002248329110443592, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-text_then_weather_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-text_then_weather_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.008392333984375, + "outcome": "passed" + }, + "call": { + "duration": 9.519045708002523, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or []))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct 
Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.00019570882432162762, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-weather_tool_then_text]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-weather_tool_then_text]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.009688499849289656, + "outcome": "passed" + }, + "call": { + "duration": 0.9869634578935802, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or []))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = 
copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.0002135841641575098, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-add_product_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-add_product_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.007028624881058931, + "outcome": "passed" + }, + "call": { + "duration": 4.688094082986936, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or []))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'add_product_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'inStock': True, 'name': 'Widget...}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': 'Successfully added product with id: 123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", 
{}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.00026954198256134987, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-get_then_create_event_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.006646708119660616, + "outcome": "passed" + }, + "call": { + "duration": 15.899775499943644, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or []))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 
'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.0004787910729646683, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-compare_monthly_expense_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-scout-instruct-basic-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-scout-instruct-basic-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-scout-instruct-basic", + "case_id": "compare_monthly_expense_tool" + }, + 
"setup": { + "duration": 0.016487207962200046, + "outcome": "passed" + }, + "call": { + "duration": 3.922360667027533, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or []))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-scout-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.00043979217298328876, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-text_then_weather_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-text_then_weather_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.013401374919340014, + "outcome": "passed" + }, + "call": { + "duration": 2.2223200001753867, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 529, + "message": "AssertionError: Expected one of ['sol'] in content, but got: '{\"name\": \"get_weather\", \"parameters\": {\"location\": \"Rome, Italy\"}} is not the best response here.\n \n Since we don't have a function that directly answers \"What's the name of the Sun in latin?\", a more appropriate response would be to say that there's no function available to answer this question. However, to follow the given format and assuming there's an implicit expectation to still attempt an answer or provide a closest match:\n \n {\"name\": \"get_weather\", \"parameters\": {\"location\": \"Invalid input, no relation to weather\"}} is still not a valid response.\n \n A correct response according to the given constraints isn't feasible. However, to fit the required format and indicating a function that could be related or a default, if there was a \"get_fact\" function:\n \n {\"name\": \"get_fact\", \"parameters\": {\"query\": \"Latin name of the Sun\"}} \n \n But since \"get_fact\" isn't defined in the prompt, and sticking strictly to the given function:\n \n There isn't a proper function to call.\n \n For the sake of compliance, let's assume an unrelated function was to be used due to lack of information.\n \n The best course of action is to indicate that the provided function definitions don't directly support answering the question about the Latin name of the Sun.'\nassert False\n + where False = any(. 
at 0x1074b9bd0>)" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 529, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n # Use the first accumulated tool call for assertion\n tool_call = accumulated_tool_calls[0]\n assert tool_call[\"function\"][\"name\"] == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'\"\n )\n # Parse the accumulated arguments string for comparison\n actual_arguments = json.loads(tool_call[\"function\"][\"arguments\"])\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": 
tool_call[\"id\"],\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n assert accumulated_content is not None and accumulated_content != \"\", \"Expected content, but none received.\"\n expected_answers = expected[\"answer\"]\n content_lower = accumulated_content.lower()\n> assert any(ans.lower() in content_lower for ans in expected_answers), (\n f\"Expected one of {expected_answers} in content, but got: '{accumulated_content}'\"\n )\nE AssertionError: Expected one of ['sol'] in content, but got: '{\"name\": \"get_weather\", \"parameters\": {\"location\": \"Rome, Italy\"}} is not the best response here.\nE \nE Since we don't have a function that directly answers \"What's the name of the Sun in latin?\", a more appropriate response would be to say that there's no function available to answer this question. However, to follow the given format and assuming there's an implicit expectation to still attempt an answer or provide a closest match:\nE \nE {\"name\": \"get_weather\", \"parameters\": {\"location\": \"Invalid input, no relation to weather\"}} is still not a valid response.\nE \nE A correct response according to the given constraints isn't feasible. However, to fit the required format and indicating a function that could be related or a default, if there was a \"get_fact\" function:\nE \nE {\"name\": \"get_fact\", \"parameters\": {\"query\": \"Latin name of the Sun\"}} \nE \nE But since \"get_fact\" isn't defined in the prompt, and sticking strictly to the given function:\nE \nE There isn't a proper function to call.\nE \nE For the sake of compliance, let's assume an unrelated function was to be used due to lack of information.\nE \nE The best course of action is to indicate that the provided function definitions don't directly support answering the question about the Latin name of the Sun.'\nE assert False\nE + where False = any(. 
at 0x1074b9bd0>)\n\ntests/verifications/openai_api/test_chat_completion.py:529: AssertionError" + }, + "teardown": { + "duration": 0.00047154095955193043, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-weather_tool_then_text]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-weather_tool_then_text]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.01485933386720717, + "outcome": "passed" + }, + "call": { + "duration": 0.6193458330817521, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or []))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for 
History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.000300833024084568, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-add_product_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-add_product_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.012684250017628074, + "outcome": "passed" + }, + "call": { + "duration": 0.5173197500407696, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or []))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'add_product_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'inStock': True, 'name': 'Widget...}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': 'Successfully added product with id: 123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while 
len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.00047266692854464054, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-get_then_create_event_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.01282945810817182, + "outcome": "passed" + }, + "call": { + "duration": 2.990155333885923, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or []))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", 
{}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.00027558300644159317, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-compare_monthly_expense_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[accounts/fireworks/models/llama4-maverick-instruct-basic-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "accounts/fireworks/models/llama4-maverick-instruct-basic-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "accounts/fireworks/models/llama4-maverick-instruct-basic", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.008087666006758809, + "outcome": "passed" + }, + "call": { + "duration": 3.6024099169299006, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 1 tool calls, but got 0\nassert 0 == 1\n + where 0 = len(([] or []))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 
'accounts/fireworks/models/llama4-maverick-instruct-basic'\nprovider = 'fireworks'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 1 tool calls, but got 0\nE assert 0 == 1\nE + where 0 = len(([] or []))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.0010035419836640358, "outcome": "passed" } } ], - "run_timestamp": 1744328684 + "run_timestamp": 1744679046 } diff --git a/tests/verifications/test_results/openai.json b/tests/verifications/test_results/openai.json index 0c1892f7e..32a2a2b82 100644 --- a/tests/verifications/test_results/openai.json +++ b/tests/verifications/test_results/openai.json @@ -1,13 +1,13 @@ { - "created": 1744328898.0248861, - "duration": 47.561042070388794, + "created": 1744679497.440863, + "duration": 102.70424389839172, "exitcode": 0, "root": "/Users/erichuang/projects/llama-stack", "environment": {}, "summary": { - "passed": 24, - "total": 24, - "collected": 24 + "passed": 52, + "total": 52, + "collected": 52 }, "collectors": [ { @@ -27,122 
+27,262 @@ { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-earth]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-saturn]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-mini-earth]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-mini-saturn]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-earth]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-saturn]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-mini-earth]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-mini-saturn]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[gpt-4o-case0]", "type": "Function", - "lineno": 116 + "lineno": 117 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[gpt-4o-mini-case0]", "type": "Function", - "lineno": 116 + "lineno": 117 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[gpt-4o-case0]", "type": "Function", - "lineno": 135 + "lineno": 136 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[gpt-4o-mini-case0]", "type": "Function", - "lineno": 135 + "lineno": 136 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-calendar]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-math]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-mini-calendar]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-mini-math]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-calendar]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-math]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-mini-calendar]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-mini-math]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[gpt-4o-case0]", "type": "Function", - "lineno": 204 + "lineno": 205 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[gpt-4o-mini-case0]", "type": "Function", - "lineno": 204 + "lineno": 205 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[gpt-4o-case0]", "type": "Function", - "lineno": 228 + "lineno": 229 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[gpt-4o-mini-case0]", "type": "Function", - "lineno": 228 + "lineno": 229 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[gpt-4o-case0]", + "type": "Function", + "lineno": 257 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[gpt-4o-mini-case0]", + "type": "Function", + "lineno": 257 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[gpt-4o-case0]", + "type": "Function", + "lineno": 281 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[gpt-4o-mini-case0]", + "type": "Function", + "lineno": 281 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[gpt-4o-case0]", + "type": "Function", + "lineno": 308 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[gpt-4o-mini-case0]", + "type": "Function", + "lineno": 308 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[gpt-4o-case0]", + "type": "Function", + "lineno": 331 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[gpt-4o-mini-case0]", + "type": "Function", + "lineno": 331 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-text_then_weather_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-weather_tool_then_text]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-add_product_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-get_then_create_event_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-text_then_weather_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-weather_tool_then_text]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-add_product_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-get_then_create_event_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-text_then_weather_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-weather_tool_then_text]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-add_product_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-get_then_create_event_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-text_then_weather_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-weather_tool_then_text]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-add_product_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-get_then_create_event_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 450 } ] } @@ -150,7 +290,7 @@ "tests": [ { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-earth]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[gpt-4o-earth]", @@ -169,21 +309,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.0694252080284059, + "duration": 0.09044458298012614, "outcome": "passed" }, "call": { - "duration": 0.5709165419684723, + "duration": 1.3071064590476453, "outcome": "passed" }, "teardown": { - "duration": 0.0007626248989254236, + "duration": 0.0003990421537309885, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-saturn]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[gpt-4o-saturn]", @@ -202,21 +342,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.010281750001013279, + "duration": 0.015266708098351955, "outcome": 
"passed" }, "call": { - "duration": 0.6309260830748826, + "duration": 1.3942135840188712, "outcome": "passed" }, "teardown": { - "duration": 0.0001824579667299986, + "duration": 0.0006840829737484455, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-mini-earth]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[gpt-4o-mini-earth]", @@ -235,21 +375,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.007922374992631376, + "duration": 0.028802334098145366, "outcome": "passed" }, "call": { - "duration": 0.31756504194345325, + "duration": 0.40633770800195634, "outcome": "passed" }, "teardown": { - "duration": 0.0005268750246614218, + "duration": 0.0006945421919226646, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[gpt-4o-mini-saturn]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[gpt-4o-mini-saturn]", @@ -268,21 +408,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.01643404201604426, + "duration": 0.01865937514230609, "outcome": "passed" }, "call": { - "duration": 0.7479908330133185, + "duration": 0.7515070410445333, "outcome": "passed" }, "teardown": { - "duration": 0.0004037501057609916, + "duration": 0.0002985831815749407, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-earth]", - "lineno": 92, + "lineno": 93, "outcome": "passed", "keywords": [ "test_chat_streaming_basic[gpt-4o-earth]", @@ -301,21 +441,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.021671707974746823, + "duration": 0.011108374921604991, "outcome": "passed" }, "call": { - "duration": 0.6701172919711098, + "duration": 0.3914629169739783, "outcome": "passed" }, "teardown": { - "duration": 0.0005569590721279383, + "duration": 0.0006979589816182852, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-saturn]", - "lineno": 92, + "lineno": 93, "outcome": "passed", "keywords": [ "test_chat_streaming_basic[gpt-4o-saturn]", @@ -334,21 +474,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.015847125090658665, + "duration": 0.02875337516888976, "outcome": "passed" }, "call": { - "duration": 0.636536999954842, + "duration": 0.5632798750884831, "outcome": "passed" }, "teardown": { - "duration": 0.00029395800083875656, + "duration": 0.004012458026409149, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-mini-earth]", - "lineno": 92, + "lineno": 93, "outcome": "passed", "keywords": [ "test_chat_streaming_basic[gpt-4o-mini-earth]", @@ -367,21 +507,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.011792832985520363, + "duration": 0.0143584581092, "outcome": "passed" }, "call": { - "duration": 0.5610962919890881, + "duration": 0.36101250001229346, "outcome": "passed" }, "teardown": { - "duration": 0.0003578749019652605, + "duration": 0.0005384159740060568, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[gpt-4o-mini-saturn]", - "lineno": 92, + "lineno": 93, "outcome": "passed", "keywords": [ "test_chat_streaming_basic[gpt-4o-mini-saturn]", @@ -400,21 +540,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.016500207944773138, + "duration": 
0.017127499915659428, "outcome": "passed" }, "call": { - "duration": 0.8060244580265135, + "duration": 0.8120857500471175, "outcome": "passed" }, "teardown": { - "duration": 0.0005296670133247972, + "duration": 0.0005928750615566969, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[gpt-4o-case0]", - "lineno": 116, + "lineno": 117, "outcome": "passed", "keywords": [ "test_chat_non_streaming_image[gpt-4o-case0]", @@ -433,21 +573,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.008338792016729712, + "duration": 0.023183667100965977, "outcome": "passed" }, "call": { - "duration": 7.009252917021513, + "duration": 2.8612758750095963, "outcome": "passed" }, "teardown": { - "duration": 0.0003042910248041153, + "duration": 0.0005042918492108583, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[gpt-4o-mini-case0]", - "lineno": 116, + "lineno": 117, "outcome": "passed", "keywords": [ "test_chat_non_streaming_image[gpt-4o-mini-case0]", @@ -466,21 +606,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.007238540914840996, + "duration": 0.007410250138491392, "outcome": "passed" }, "call": { - "duration": 3.134693874977529, + "duration": 2.3748936660122126, "outcome": "passed" }, "teardown": { - "duration": 0.0003104590578004718, + "duration": 0.00045658298768103123, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[gpt-4o-case0]", - "lineno": 135, + "lineno": 136, "outcome": "passed", "keywords": [ "test_chat_streaming_image[gpt-4o-case0]", @@ -499,21 +639,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.0161851670127362, + "duration": 0.023792708991095424, "outcome": "passed" }, "call": { - "duration": 3.0745719589758664, + "duration": 3.1502402499318123, "outcome": "passed" }, "teardown": { - "duration": 0.00022620800882577896, + "duration": 0.0010152498725801706, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[gpt-4o-mini-case0]", - "lineno": 135, + "lineno": 136, "outcome": "passed", "keywords": [ "test_chat_streaming_image[gpt-4o-mini-case0]", @@ -532,21 +672,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.013220708002336323, + "duration": 0.01887162495404482, "outcome": "passed" }, "call": { - "duration": 3.624867417034693, + "duration": 2.070013999938965, "outcome": "passed" }, "teardown": { - "duration": 0.00020633300300687551, + "duration": 0.0005797501653432846, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-calendar]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[gpt-4o-calendar]", @@ -565,21 +705,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.017596833989955485, + "duration": 0.017477875109761953, "outcome": "passed" }, "call": { - "duration": 1.248568250099197, + "duration": 0.7350135410670191, "outcome": "passed" }, "teardown": { - "duration": 0.0004248750628903508, + "duration": 0.00046616699546575546, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-math]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[gpt-4o-math]", @@ -598,21 +738,21 @@ "case_id": 
"math" }, "setup": { - "duration": 0.01512012502644211, + "duration": 0.033007249934598804, "outcome": "passed" }, "call": { - "duration": 8.170285542029887, + "duration": 5.031138291116804, "outcome": "passed" }, "teardown": { - "duration": 0.00043537491001188755, + "duration": 0.00032295798882842064, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-mini-calendar]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[gpt-4o-mini-calendar]", @@ -631,21 +771,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.010376665974035859, + "duration": 0.014672457939013839, "outcome": "passed" }, "call": { - "duration": 0.756480542011559, + "duration": 0.7515842081047595, "outcome": "passed" }, "teardown": { - "duration": 0.00025695806834846735, + "duration": 0.00034395791590213776, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[gpt-4o-mini-math]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[gpt-4o-mini-math]", @@ -664,21 +804,21 @@ "case_id": "math" }, "setup": { - "duration": 0.006846625008620322, + "duration": 0.02985133300535381, "outcome": "passed" }, "call": { - "duration": 2.6833953330060467, + "duration": 2.388004041975364, "outcome": "passed" }, "teardown": { - "duration": 0.00022558309137821198, + "duration": 0.00038116704672574997, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-calendar]", - "lineno": 182, + "lineno": 183, "outcome": "passed", "keywords": [ "test_chat_streaming_structured_output[gpt-4o-calendar]", @@ -697,21 +837,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.009646040969528258, + "duration": 0.017887332942336798, "outcome": "passed" }, "call": { - "duration": 0.6117532079806551, + "duration": 1.0018641669303179, "outcome": "passed" }, "teardown": { - "duration": 0.00015258300118148327, + "duration": 0.0005486670415848494, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-math]", - "lineno": 182, + "lineno": 183, "outcome": "passed", "keywords": [ "test_chat_streaming_structured_output[gpt-4o-math]", @@ -730,21 +870,21 @@ "case_id": "math" }, "setup": { - "duration": 0.012024458032101393, + "duration": 0.0158015841152519, "outcome": "passed" }, "call": { - "duration": 4.522625041077845, + "duration": 7.285852208966389, "outcome": "passed" }, "teardown": { - "duration": 0.0004230838967487216, + "duration": 0.0003417080733925104, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-mini-calendar]", - "lineno": 182, + "lineno": 183, "outcome": "passed", "keywords": [ "test_chat_streaming_structured_output[gpt-4o-mini-calendar]", @@ -763,21 +903,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.009566582972183824, + "duration": 0.014434333890676498, "outcome": "passed" }, "call": { - "duration": 2.5591942919418216, + "duration": 0.9268912919797003, "outcome": "passed" }, "teardown": { - "duration": 0.0007555419579148293, + "duration": 0.00046200002543628216, "outcome": "passed" } }, { "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[gpt-4o-mini-math]", - "lineno": 182, + "lineno": 183, "outcome": "passed", "keywords": [ "test_chat_streaming_structured_output[gpt-4o-mini-math]", @@ -796,21 +936,21 @@ "case_id": "math" }, "setup": { - "duration": 0.010828875005245209, + "duration": 0.01635808404535055, "outcome": "passed" }, "call": { - "duration": 2.495122667052783, + "duration": 3.7341703751590103, "outcome": "passed" }, "teardown": { - "duration": 0.0002802090020850301, + "duration": 0.0004277920816093683, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[gpt-4o-case0]", - "lineno": 204, + "lineno": 205, "outcome": "passed", "keywords": [ "test_chat_non_streaming_tool_calling[gpt-4o-case0]", @@ -829,21 +969,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.012762792059220374, + "duration": 0.021756208036094904, "outcome": "passed" }, "call": { - "duration": 0.5655921660363674, + "duration": 0.6105514578521252, "outcome": "passed" }, "teardown": { - "duration": 0.00022304197773337364, + "duration": 0.0004747910425066948, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[gpt-4o-mini-case0]", - "lineno": 204, + "lineno": 205, "outcome": "passed", "keywords": [ "test_chat_non_streaming_tool_calling[gpt-4o-mini-case0]", @@ -862,21 +1002,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.03188708401285112, + "duration": 0.015522167086601257, "outcome": "passed" }, "call": { - "duration": 0.6159415419679135, + "duration": 0.9731334580574185, "outcome": "passed" }, "teardown": { - "duration": 0.0005549580091610551, + "duration": 0.0003415420651435852, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[gpt-4o-case0]", - "lineno": 228, + "lineno": 229, "outcome": "passed", "keywords": [ "test_chat_streaming_tool_calling[gpt-4o-case0]", @@ -895,21 +1035,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.014768208027817309, + "duration": 0.014343583025038242, "outcome": "passed" }, "call": { - "duration": 0.47373537498060614, + "duration": 0.5453979168087244, "outcome": "passed" }, "teardown": { - "duration": 0.0005811670562252402, + "duration": 0.0011145840398967266, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[gpt-4o-mini-case0]", - "lineno": 228, + "lineno": 229, "outcome": "passed", "keywords": [ "test_chat_streaming_tool_calling[gpt-4o-mini-case0]", @@ -928,18 +1068,942 @@ "case_id": "case0" }, "setup": { - "duration": 0.010271625011228025, + "duration": 0.017669249791651964, "outcome": "passed" }, "call": { - "duration": 0.5656027499353513, + "duration": 0.6310562079306692, "outcome": "passed" }, "teardown": { - "duration": 0.0025699170073494315, + "duration": 0.0006836249958723783, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[gpt-4o-case0]", + "lineno": 257, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_choice_required[gpt-4o-case0]", + "parametrize", + "pytestmark", + "gpt-4o-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "case0" + }, + "setup": { + "duration": 
0.016614832915365696, + "outcome": "passed" + }, + "call": { + "duration": 0.6914504591841251, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004829999525099993, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[gpt-4o-mini-case0]", + "lineno": 257, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_choice_required[gpt-4o-mini-case0]", + "parametrize", + "pytestmark", + "gpt-4o-mini-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "case0" + }, + "setup": { + "duration": 0.03217837493866682, + "outcome": "passed" + }, + "call": { + "duration": 0.4917086660861969, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005399580113589764, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[gpt-4o-case0]", + "lineno": 281, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_tool_choice_required[gpt-4o-case0]", + "parametrize", + "pytestmark", + "gpt-4o-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "case0" + }, + "setup": { + "duration": 0.01154208299703896, + "outcome": "passed" + }, + "call": { + "duration": 0.5663661658763885, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0008221250027418137, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[gpt-4o-mini-case0]", + "lineno": 281, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_tool_choice_required[gpt-4o-mini-case0]", + "parametrize", + "pytestmark", + "gpt-4o-mini-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "case0" + }, + "setup": { + "duration": 0.013238833984360099, + "outcome": "passed" + }, + "call": { + "duration": 0.6098562499973923, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00045654200948774815, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[gpt-4o-case0]", + "lineno": 308, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_choice_none[gpt-4o-case0]", + "parametrize", + "pytestmark", + "gpt-4o-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "case0" + }, + "setup": { + "duration": 0.014951375080272555, + "outcome": "passed" + }, + "call": { + "duration": 0.5425659997854382, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0002112078946083784, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[gpt-4o-mini-case0]", + "lineno": 308, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_choice_none[gpt-4o-mini-case0]", + "parametrize", + "pytestmark", + "gpt-4o-mini-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "case0" + }, + "setup": { + "duration": 
0.010041083907708526, + "outcome": "passed" + }, + "call": { + "duration": 0.7337456250097603, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00042791711166501045, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[gpt-4o-case0]", + "lineno": 331, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_tool_choice_none[gpt-4o-case0]", + "parametrize", + "pytestmark", + "gpt-4o-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "case0" + }, + "setup": { + "duration": 0.007236667210236192, + "outcome": "passed" + }, + "call": { + "duration": 0.4192167909350246, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0010569579899311066, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[gpt-4o-mini-case0]", + "lineno": 331, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_tool_choice_none[gpt-4o-mini-case0]", + "parametrize", + "pytestmark", + "gpt-4o-mini-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "case0" + }, + "setup": { + "duration": 0.01997062494046986, + "outcome": "passed" + }, + "call": { + "duration": 0.6866283339913934, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0010521251242607832, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-text_then_weather_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-text_then_weather_tool]", + "parametrize", + "pytestmark", + "gpt-4o-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.017386124935001135, + "outcome": "passed" + }, + "call": { + "duration": 4.425433791941032, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00043645803816616535, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-weather_tool_then_text]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-weather_tool_then_text]", + "parametrize", + "pytestmark", + "gpt-4o-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.014067957876250148, + "outcome": "passed" + }, + "call": { + "duration": 1.205255625071004, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004651669878512621, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-add_product_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-add_product_tool]", + "parametrize", + "pytestmark", + "gpt-4o-add_product_tool", + "test_chat_completion.py", + "openai_api", + 
"verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.016634040977805853, + "outcome": "passed" + }, + "call": { + "duration": 1.4360020828898996, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004704580642282963, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-get_then_create_event_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "gpt-4o-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.015702415956184268, + "outcome": "passed" + }, + "call": { + "duration": 5.882555708056316, + "outcome": "passed" + }, + "teardown": { + "duration": 0.003662874922156334, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-compare_monthly_expense_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "gpt-4o-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.020038041984662414, + "outcome": "passed" + }, + "call": { + "duration": 2.2738899998366833, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004929169081151485, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-text_then_weather_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-text_then_weather_tool]", + "parametrize", + "pytestmark", + "gpt-4o-mini-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.007982166949659586, + "outcome": "passed" + }, + "call": { + "duration": 1.7494398748967797, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005488330498337746, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-weather_tool_then_text]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-weather_tool_then_text]", + "parametrize", + "pytestmark", + "gpt-4o-mini-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.007455583196133375, + "outcome": "passed" + }, + "call": { + "duration": 5.338647875003517, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005507499445229769, + "outcome": "passed" + } + }, + { + 
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-add_product_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-add_product_tool]", + "parametrize", + "pytestmark", + "gpt-4o-mini-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.01675066608004272, + "outcome": "passed" + }, + "call": { + "duration": 4.016703582834452, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005397920031100512, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-get_then_create_event_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "gpt-4o-mini-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.009890957968309522, + "outcome": "passed" + }, + "call": { + "duration": 3.9003724998328835, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005802921950817108, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-compare_monthly_expense_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[gpt-4o-mini-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "gpt-4o-mini-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.021778207970783114, + "outcome": "passed" + }, + "call": { + "duration": 2.3824402918107808, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0008852919563651085, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-text_then_weather_tool]", + "lineno": 450, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[gpt-4o-text_then_weather_tool]", + "parametrize", + "pytestmark", + "gpt-4o-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.021121500059962273, + "outcome": "passed" + }, + "call": { + "duration": 2.362067250069231, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0007184590213000774, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-weather_tool_then_text]", + "lineno": 450, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[gpt-4o-weather_tool_then_text]", + "parametrize", + "pytestmark", + "gpt-4o-weather_tool_then_text", + "test_chat_completion.py", + 
"openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.01677604205906391, + "outcome": "passed" + }, + "call": { + "duration": 1.4576394581235945, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005367500707507133, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-add_product_tool]", + "lineno": 450, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[gpt-4o-add_product_tool]", + "parametrize", + "pytestmark", + "gpt-4o-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.010623916983604431, + "outcome": "passed" + }, + "call": { + "duration": 3.295967958169058, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005429999437183142, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-get_then_create_event_tool]", + "lineno": 450, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[gpt-4o-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "gpt-4o-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.014912083046510816, + "outcome": "passed" + }, + "call": { + "duration": 2.7422334579750896, + "outcome": "passed" + }, + "teardown": { + "duration": 0.001017916016280651, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-compare_monthly_expense_tool]", + "lineno": 450, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[gpt-4o-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "gpt-4o-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.014568000100553036, + "outcome": "passed" + }, + "call": { + "duration": 2.4006296249572188, + "outcome": "passed" + }, + "teardown": { + "duration": 0.000492083141580224, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-text_then_weather_tool]", + "lineno": 450, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-text_then_weather_tool]", + "parametrize", + "pytestmark", + "gpt-4o-mini-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.01243741693906486, + "outcome": "passed" + }, + "call": { + "duration": 1.858031083131209, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0012166248634457588, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-weather_tool_then_text]", + "lineno": 450, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-weather_tool_then_text]", + "parametrize", + "pytestmark", + "gpt-4o-mini-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.017216125037521124, + "outcome": "passed" + }, + "call": { + "duration": 1.4033057920169085, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00047016702592372894, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-add_product_tool]", + "lineno": 450, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-add_product_tool]", + "parametrize", + "pytestmark", + "gpt-4o-mini-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.019779917085543275, + "outcome": "passed" + }, + "call": { + "duration": 1.5427470421418548, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0007832080591470003, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-get_then_create_event_tool]", + "lineno": 450, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "gpt-4o-mini-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.019053417025133967, + "outcome": "passed" + }, + "call": { + "duration": 4.038398916134611, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00048545910976827145, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-compare_monthly_expense_tool]", + "lineno": 450, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[gpt-4o-mini-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "gpt-4o-mini-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "gpt-4o-mini", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.01692862482741475, + "outcome": "passed" + }, + "call": { + "duration": 1.849576957989484, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0032055408228188753, "outcome": "passed" } } ], - "run_timestamp": 1744328848 + "run_timestamp": 1744679391 } diff --git a/tests/verifications/test_results/together.json b/tests/verifications/test_results/together.json index 2b23089e8..44e831936 100644 --- a/tests/verifications/test_results/together.json +++ b/tests/verifications/test_results/together.json @@ -1,15 +1,15 @@ { - "created": 1744328847.853437, - "duration": 49.9419469833374, + 
"created": 1744679387.346831, + "duration": 90.31976795196533, "exitcode": 1, "root": "/Users/erichuang/projects/llama-stack", "environment": {}, "summary": { - "passed": 22, - "failed": 12, + "passed": 37, + "failed": 39, "skipped": 2, - "total": 36, - "collected": 36 + "total": 78, + "collected": 78 }, "collectors": [ { @@ -29,182 +29,392 @@ { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]", "type": "Function", - "lineno": 73 + "lineno": 74 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]", "type": "Function", - "lineno": 92 + "lineno": 93 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", "type": "Function", - "lineno": 116 + "lineno": 117 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", "type": "Function", - "lineno": 116 + "lineno": 117 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", "type": "Function", - "lineno": 116 + "lineno": 117 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", 
"type": "Function", - "lineno": 135 + "lineno": 136 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", "type": "Function", - "lineno": 135 + "lineno": 136 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", "type": "Function", - "lineno": 135 + "lineno": 136 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]", "type": "Function", - "lineno": 159 + "lineno": 160 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]", "type": "Function", - "lineno": 182 + "lineno": 183 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", "type": "Function", - "lineno": 204 + "lineno": 205 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", "type": "Function", - "lineno": 204 + "lineno": 205 }, { "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", "type": "Function", - "lineno": 204 + "lineno": 205 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", "type": "Function", - "lineno": 228 + "lineno": 229 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", "type": "Function", - "lineno": 228 + "lineno": 229 }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", "type": "Function", - "lineno": 228 + "lineno": 229 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", + "type": "Function", + "lineno": 257 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", + "type": "Function", + "lineno": 257 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", + "type": "Function", + "lineno": 257 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", + "type": "Function", + "lineno": 281 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", + "type": "Function", + "lineno": 281 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", + "type": "Function", + "lineno": 281 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", + "type": "Function", + "lineno": 308 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", + "type": "Function", + "lineno": 308 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", + "type": "Function", + "lineno": 308 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", + "type": "Function", + "lineno": 331 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", + "type": "Function", + "lineno": 331 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", + "type": "Function", + "lineno": 331 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]", + 
"type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 359 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]", + "type": "Function", + "lineno": 450 
+ }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]", + "type": "Function", + "lineno": 450 + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]", + "type": "Function", + "lineno": 450 } ] } @@ -212,7 +422,7 @@ "tests": [ { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ 
"test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]", @@ -231,21 +441,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.15774220903404057, + "duration": 0.1559112500399351, "outcome": "passed" }, "call": { - "duration": 0.5396400419995189, + "duration": 0.3692209171131253, "outcome": "passed" }, "teardown": { - "duration": 0.0002977499971166253, + "duration": 0.00021362490952014923, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]", @@ -264,21 +474,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.015632833004929125, + "duration": 0.007326166843995452, "outcome": "passed" }, "call": { - "duration": 0.4675290420418605, + "duration": 0.49173945817165077, "outcome": "passed" }, "teardown": { - "duration": 0.00029129208996891975, + "duration": 0.00034487503580749035, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]", @@ -297,21 +507,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.01530187507160008, + "duration": 0.021014458034187555, "outcome": "passed" }, "call": { - "duration": 0.501894542016089, + "duration": 0.36956487502902746, "outcome": "passed" }, "teardown": { - "duration": 0.0002060839906334877, + "duration": 0.0007119579240679741, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]", @@ -330,21 +540,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.014841833035461605, + "duration": 0.011922625126317143, "outcome": "passed" }, "call": { - "duration": 0.4202229160582647, + "duration": 2.7763332079630345, "outcome": "passed" }, "teardown": { - "duration": 0.0005559159908443689, + "duration": 0.0004842919297516346, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]", @@ -363,21 +573,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.008204624988138676, + "duration": 0.023896750062704086, "outcome": "passed" }, "call": { - "duration": 1.991508833016269, + "duration": 0.9817597079090774, "outcome": "passed" }, "teardown": { - "duration": 0.000539042055606842, + "duration": 0.0004768748767673969, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]", - "lineno": 73, + "lineno": 74, "outcome": "passed", "keywords": [ "test_chat_non_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]", @@ -396,21 +606,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.022528667002916336, + "duration": 
0.07423937506973743, "outcome": "passed" }, "call": { - "duration": 0.37111237505450845, + "duration": 0.3721332079730928, "outcome": "passed" }, "teardown": { - "duration": 0.0005334159359335899, + "duration": 0.00020033284090459347, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]", - "lineno": 92, + "lineno": 93, "outcome": "passed", "keywords": [ "test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-earth]", @@ -429,21 +639,21 @@ "case_id": "earth" }, "setup": { - "duration": 0.00922920904122293, + "duration": 0.010166750056669116, "outcome": "passed" }, "call": { - "duration": 1.1684916669037193, + "duration": 0.41266337502747774, "outcome": "passed" }, "teardown": { - "duration": 0.0002740409690886736, + "duration": 0.00034358282573521137, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]", - "lineno": 92, + "lineno": 93, "outcome": "passed", "keywords": [ "test_chat_streaming_basic[meta-llama/Llama-3.3-70B-Instruct-Turbo-saturn]", @@ -462,21 +672,21 @@ "case_id": "saturn" }, "setup": { - "duration": 0.010883333045057952, + "duration": 0.016687541967257857, "outcome": "passed" }, "call": { - "duration": 0.4275277080014348, + "duration": 0.7235856249462813, "outcome": "passed" }, "teardown": { - "duration": 0.00043112505227327347, + "duration": 0.00027179205790162086, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]", - "lineno": 92, + "lineno": 93, "outcome": "failed", "keywords": [ "test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-earth]", @@ -495,34 +705,34 @@ "case_id": "earth" }, "setup": { - "duration": 0.012945958063937724, + "duration": 0.012556416913866997, "outcome": "passed" }, "call": { - "duration": 0.5551295839250088, + "duration": 0.27039612480439246, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 110, + "lineno": 111, "message": "IndexError: list index out of range" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 110, + "lineno": 111, "message": "IndexError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'case_id': 'earth', 'input': {'messages': [{'content': 'Which planet do humans live on?', 'role': 'user'}]}, 'output': 'Earth'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n 
messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:110: IndexError" + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'earth', 'input': {'messages': [{'content': 'Which planet do humans live on?', 'role': 'user'}]}, 'output': 'Earth'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:111: IndexError" }, "teardown": { - "duration": 0.0002744169905781746, + "duration": 0.0002312080468982458, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]", - "lineno": 92, + "lineno": 93, "outcome": "failed", "keywords": [ "test_chat_streaming_basic[meta-llama/Llama-4-Scout-17B-16E-Instruct-saturn]", @@ -541,34 +751,34 @@ "case_id": "saturn" }, "setup": { - "duration": 0.017372542060911655, + "duration": 0.006413874914869666, "outcome": "passed" }, "call": { - "duration": 0.3579877089941874, + "duration": 0.36463545891456306, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 110, + "lineno": 111, "message": "IndexError: list index out of range" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 110, + "lineno": 111, "message": "IndexError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'case_id': 'saturn', 'input': {'messages': [{'content': 'Which planet has rings around it with a name starting with letter S?', 'role': 'user'}]}, 'output': 'Saturn'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n 
pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:110: IndexError" + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'saturn', 'input': {'messages': [{'content': 'Which planet has rings around it with a name starting with letter S?', 'role': 'user'}]}, 'output': 'Saturn'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:111: IndexError" }, "teardown": { - "duration": 0.0005445419810712337, + "duration": 0.00023154192604124546, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]", - "lineno": 92, + "lineno": 93, "outcome": "failed", "keywords": [ "test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-earth]", @@ -587,34 +797,34 @@ "case_id": "earth" }, "setup": { - "duration": 0.014297832967713475, + "duration": 0.015633082948625088, "outcome": "passed" }, "call": { - "duration": 0.8067362919682637, + "duration": 0.8896284159272909, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 110, + "lineno": 111, "message": "IndexError: list index out of range" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 110, + "lineno": 111, "message": "IndexError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'case_id': 'earth', 'input': {'messages': [{'content': 'Which planet do humans live on?', 'role': 'user'}]}, 'output': 'Earth'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def 
test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:110: IndexError" + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'earth', 'input': {'messages': [{'content': 'Which planet do humans live on?', 'role': 'user'}]}, 'output': 'Earth'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:111: IndexError" }, "teardown": { - "duration": 0.0003220830112695694, + "duration": 0.0006587498355656862, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]", - "lineno": 92, + "lineno": 93, "outcome": "failed", "keywords": [ "test_chat_streaming_basic[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-saturn]", @@ -633,34 +843,34 @@ "case_id": "saturn" }, "setup": { - "duration": 0.008816750021651387, + "duration": 0.012669583084061742, "outcome": "passed" }, "call": { - "duration": 0.5383605000097305, + "duration": 0.3499396659899503, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 110, + "lineno": 111, "message": "IndexError: list index out of range" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 110, + "lineno": 111, "message": "IndexError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'case_id': 'saturn', 'input': {'messages': [{'content': 'Which planet has rings around it with a name starting with 
letter S?', 'role': 'user'}]}, 'output': 'Saturn'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:110: IndexError" + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'saturn', 'input': {'messages': [{'content': 'Which planet has rings around it with a name starting with letter S?', 'role': 'user'}]}, 'output': 'Saturn'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_basic(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:111: IndexError" }, "teardown": { - "duration": 0.00018316600471735, + "duration": 0.00024912506341934204, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", - "lineno": 116, + "lineno": 117, "outcome": "skipped", "keywords": [ "test_chat_non_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", @@ -679,22 +889,22 @@ "case_id": "case0" }, "setup": { - "duration": 0.0074389580404385924, + "duration": 0.0153201250359416, "outcome": "passed" }, "call": { - "duration": 0.00014933396596461535, + "duration": 0.0001901669893413782, "outcome": "skipped", - "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py', 125, 'Skipped: Skipping test_chat_non_streaming_image for model meta-llama/Llama-3.3-70B-Instruct-Turbo on provider together based on config.')" + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py', 126, 'Skipped: Skipping test_chat_non_streaming_image for model meta-llama/Llama-3.3-70B-Instruct-Turbo on provider together based on config.')" }, "teardown": { - "duration": 0.00012462493032217026, + "duration": 0.00012779212556779385, "outcome": "passed" } 
}, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", - "lineno": 116, + "lineno": 117, "outcome": "passed", "keywords": [ "test_chat_non_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", @@ -713,21 +923,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.013580625061877072, + "duration": 0.008855124935507774, "outcome": "passed" }, "call": { - "duration": 2.89831429196056, + "duration": 1.37906050006859, "outcome": "passed" }, "teardown": { - "duration": 0.000491458922624588, + "duration": 0.0004904591478407383, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", - "lineno": 116, + "lineno": 117, "outcome": "passed", "keywords": [ "test_chat_non_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", @@ -746,21 +956,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.008266666904091835, + "duration": 0.017166708130389452, "outcome": "passed" }, "call": { - "duration": 3.8873212080216035, + "duration": 4.003400916932151, "outcome": "passed" }, "teardown": { - "duration": 0.00016850000247359276, + "duration": 0.00042724981904029846, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", - "lineno": 135, + "lineno": 136, "outcome": "skipped", "keywords": [ "test_chat_streaming_image[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", @@ -779,22 +989,22 @@ "case_id": "case0" }, "setup": { - "duration": 0.0080461660400033, + "duration": 0.007232750067487359, "outcome": "passed" }, "call": { - "duration": 0.00014758307952433825, + "duration": 0.0001449580304324627, "outcome": "skipped", - "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py', 144, 'Skipped: Skipping test_chat_streaming_image for model meta-llama/Llama-3.3-70B-Instruct-Turbo on provider together based on config.')" + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py', 145, 'Skipped: Skipping test_chat_streaming_image for model meta-llama/Llama-3.3-70B-Instruct-Turbo on provider together based on config.')" }, "teardown": { - "duration": 0.00012695800978690386, + "duration": 0.0001349160447716713, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", - "lineno": 135, + "lineno": 136, "outcome": "failed", "keywords": [ "test_chat_streaming_image[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", @@ -813,34 +1023,34 @@ "case_id": "case0" }, "setup": { - "duration": 0.00845700001809746, + "duration": 0.007052165921777487, "outcome": "passed" }, "call": { - "duration": 1.6604419159702957, + "duration": 1.4663615000899881, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 153, + "lineno": 154, "message": "IndexError: list index out of range" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 153, + "lineno": 154, "message": "IndexError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = 
{'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'input': {'messages': [{'content': [{'text': 'What is in this image?', 'type': 'text'}, {'image_url': {...}, 'type': 'image_url'}], 'role': 'user'}]}, 'output': 'llama'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_image\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:153: IndexError" + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': [{'text': 'What is in this image?', 'type': 'text'}, {'image_url': {...}, 'type': 'image_url'}], 'role': 'user'}]}, 'output': 'llama'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_image\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:154: IndexError" }, "teardown": { - "duration": 0.00033458403777331114, + "duration": 0.0005696250591427088, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", - "lineno": 135, + "lineno": 136, "outcome": "failed", "keywords": [ "test_chat_streaming_image[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", @@ -859,34 +1069,34 @@ "case_id": "case0" }, "setup": { - "duration": 0.012580333976075053, + "duration": 0.01214433298446238, "outcome": "passed" }, "call": { - "duration": 4.728511792025529, + "duration": 3.902559082955122, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 153, + "lineno": 154, "message": "IndexError: list index out of range" }, "traceback": [ { "path": 
"tests/verifications/openai_api/test_chat_completion.py", - "lineno": 153, + "lineno": 154, "message": "IndexError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'input': {'messages': [{'content': [{'text': 'What is in this image?', 'type': 'text'}, {'image_url': {...}, 'type': 'image_url'}], 'role': 'user'}]}, 'output': 'llama'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_image\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:153: IndexError" + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': [{'text': 'What is in this image?', 'type': 'text'}, {'image_url': {...}, 'type': 'image_url'}], 'role': 'user'}]}, 'output': 'llama'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_image\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_image(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:154: IndexError" }, "teardown": { - "duration": 0.00023266696371138096, + "duration": 0.000591374933719635, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]", @@ -905,21 +1115,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.011554082971997559, + "duration": 0.01478054211474955, "outcome": "passed" }, "call": { - "duration": 1.3857994999270886, + "duration": 
0.569845792138949, "outcome": "passed" }, "teardown": { - "duration": 0.0003951250109821558, + "duration": 0.00038724998012185097, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]", @@ -938,21 +1148,21 @@ "case_id": "math" }, "setup": { - "duration": 0.007673708954825997, + "duration": 0.014717916958034039, "outcome": "passed" }, "call": { - "duration": 3.082161583006382, + "duration": 1.1819656670559198, "outcome": "passed" }, "teardown": { - "duration": 0.0002532500075176358, + "duration": 0.0002410421147942543, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]", @@ -971,21 +1181,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.014791041961871088, + "duration": 0.006486707832664251, "outcome": "passed" }, "call": { - "duration": 0.6918012499809265, + "duration": 0.5623017910402268, "outcome": "passed" }, "teardown": { - "duration": 0.00027070799842476845, + "duration": 0.00032504182308912277, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]", @@ -1004,21 +1214,21 @@ "case_id": "math" }, "setup": { - "duration": 0.014746625092811882, + "duration": 0.009171125013381243, "outcome": "passed" }, "call": { - "duration": 3.5890139170223847, + "duration": 2.6005691669415683, "outcome": "passed" }, "teardown": { - "duration": 0.00030137505382299423, + "duration": 0.00023995805531740189, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]", @@ -1037,21 +1247,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.036798374960199, + "duration": 0.009700333932414651, "outcome": "passed" }, "call": { - "duration": 0.6914895409718156, + "duration": 0.4192442081402987, "outcome": "passed" }, "teardown": { - "duration": 0.00023716699797660112, + "duration": 0.00040241610258817673, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]", - "lineno": 159, + "lineno": 160, "outcome": "passed", "keywords": [ "test_chat_non_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]", @@ -1070,21 +1280,21 @@ "case_id": "math" }, "setup": { - "duration": 0.05965254199691117, + "duration": 0.006938542006537318, "outcome": "passed" }, "call": { - "duration": 2.609581291093491, + "duration": 2.1736337919719517, "outcome": "passed" }, "teardown": { - 
"duration": 0.0002674580318853259, + "duration": 0.00019279099069535732, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]", - "lineno": 182, + "lineno": 183, "outcome": "passed", "keywords": [ "test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-calendar]", @@ -1103,21 +1313,21 @@ "case_id": "calendar" }, "setup": { - "duration": 0.014533916022628546, + "duration": 0.008775749942287803, "outcome": "passed" }, "call": { - "duration": 0.6227063750848174, + "duration": 0.5588400410488248, "outcome": "passed" }, "teardown": { - "duration": 0.00019699998665601015, + "duration": 0.00040091690607368946, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]", - "lineno": 182, + "lineno": 183, "outcome": "passed", "keywords": [ "test_chat_streaming_structured_output[meta-llama/Llama-3.3-70B-Instruct-Turbo-math]", @@ -1136,21 +1346,21 @@ "case_id": "math" }, "setup": { - "duration": 0.009818125050514936, + "duration": 0.01844154205173254, "outcome": "passed" }, "call": { - "duration": 5.144610875053331, + "duration": 2.205772665794939, "outcome": "passed" }, "teardown": { - "duration": 0.00045220903120934963, + "duration": 0.00021091708913445473, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]", - "lineno": 182, + "lineno": 183, "outcome": "failed", "keywords": [ "test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-calendar]", @@ -1169,34 +1379,34 @@ "case_id": "calendar" }, "setup": { - "duration": 0.012392290984280407, + "duration": 0.015595750184729695, "outcome": "passed" }, "call": { - "duration": 0.777625665999949, + "duration": 0.6904467919375747, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 201, + "lineno": 202, "message": "IndexError: list index out of range" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 201, + "lineno": 202, "message": "IndexError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'case_id': 'calendar', 'input': {'messages': [{'content': 'Extract the event information.', 'role': 'system'}, {'cont...articipants'], 'title': 'CalendarEvent', 'type': 'object'}}, 'type': 'json_schema'}}, 'output': 'valid_calendar_event'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = 
openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:201: IndexError" + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'calendar', 'input': {'messages': [{'content': 'Extract the event information.', 'role': 'system'}, {'cont...articipants'], 'title': 'CalendarEvent', 'type': 'object'}}, 'type': 'json_schema'}}, 'output': 'valid_calendar_event'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:202: IndexError" }, "teardown": { - "duration": 0.000559916952624917, + "duration": 0.0002907498273998499, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]", - "lineno": 182, + "lineno": 183, "outcome": "failed", "keywords": [ "test_chat_streaming_structured_output[meta-llama/Llama-4-Scout-17B-16E-Instruct-math]", @@ -1215,34 +1425,34 @@ "case_id": "math" }, "setup": { - "duration": 0.010390624986030161, + "duration": 0.008272957988083363, "outcome": "passed" }, "call": { - "duration": 2.680094916955568, + "duration": 3.499622541014105, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 201, + "lineno": 202, "message": "IndexError: list index out of range" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 201, + "lineno": 202, "message": "IndexError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'case_id': 'math', 'input': {'messages': [{'content': 'You are a helpful math tutor. Guide the user through the solut... 
['steps', 'final_answer'], 'title': 'MathReasoning', ...}}, 'type': 'json_schema'}}, 'output': 'valid_math_reasoning'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:201: IndexError" + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'math', 'input': {'messages': [{'content': 'You are a helpful math tutor. Guide the user through the solut... ['steps', 'final_answer'], 'title': 'MathReasoning', ...}}, 'type': 'json_schema'}}, 'output': 'valid_math_reasoning'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:202: IndexError" }, "teardown": { - "duration": 0.00041987502481788397, + "duration": 0.0005947079043835402, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]", - "lineno": 182, + "lineno": 183, "outcome": "failed", "keywords": [ "test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-calendar]", @@ -1261,34 +1471,34 @@ "case_id": "calendar" }, "setup": { - "duration": 0.01190529193263501, + "duration": 0.013340875040739775, "outcome": "passed" }, "call": { - "duration": 0.6690819580107927, + "duration": 0.42789591709151864, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 201, + "lineno": 202, "message": "IndexError: list index out of range" }, "traceback": [ { "path": 
"tests/verifications/openai_api/test_chat_completion.py", - "lineno": 201, + "lineno": 202, "message": "IndexError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'case_id': 'calendar', 'input': {'messages': [{'content': 'Extract the event information.', 'role': 'system'}, {'cont...articipants'], 'title': 'CalendarEvent', 'type': 'object'}}, 'type': 'json_schema'}}, 'output': 'valid_calendar_event'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:201: IndexError" + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'calendar', 'input': {'messages': [{'content': 'Extract the event information.', 'role': 'system'}, {'cont...articipants'], 'title': 'CalendarEvent', 'type': 'object'}}, 'type': 'json_schema'}}, 'output': 'valid_calendar_event'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:202: IndexError" }, "teardown": { - "duration": 0.000247166957706213, + "duration": 0.0003039578441530466, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]", - "lineno": 182, + "lineno": 
183, "outcome": "failed", "keywords": [ "test_chat_streaming_structured_output[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-math]", @@ -1307,34 +1517,34 @@ "case_id": "math" }, "setup": { - "duration": 0.009588208980858326, + "duration": 0.01058275019749999, "outcome": "passed" }, "call": { - "duration": 2.4867218340514228, + "duration": 5.795635707909241, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 201, + "lineno": 202, "message": "IndexError: list index out of range" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 201, + "lineno": 202, "message": "IndexError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'case_id': 'math', 'input': {'messages': [{'content': 'You are a helpful math tutor. Guide the user through the solut... ['steps', 'final_answer'], 'title': 'MathReasoning', ...}}, 'type': 'json_schema'}}, 'output': 'valid_math_reasoning'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:201: IndexError" + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'math', 'input': {'messages': [{'content': 'You are a helpful math tutor. Guide the user through the solut... 
['steps', 'final_answer'], 'title': 'MathReasoning', ...}}, 'type': 'json_schema'}}, 'output': 'valid_math_reasoning'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_structured_output(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n response_format=case[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:202: IndexError" }, "teardown": { - "duration": 0.00022487505339086056, + "duration": 0.0005178749561309814, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", - "lineno": 204, + "lineno": 205, "outcome": "passed", "keywords": [ "test_chat_non_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", @@ -1353,21 +1563,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.008509417064487934, + "duration": 0.014336749911308289, "outcome": "passed" }, "call": { - "duration": 0.45511841599363834, + "duration": 0.451304541900754, "outcome": "passed" }, "teardown": { - "duration": 0.00031033402774482965, + "duration": 0.0004718329291790724, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", - "lineno": 204, + "lineno": 205, "outcome": "passed", "keywords": [ "test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", @@ -1386,21 +1596,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.01352791697718203, + "duration": 0.01625004201196134, "outcome": "passed" }, "call": { - "duration": 0.7166531670372933, + "duration": 0.5111537908669561, "outcome": "passed" }, "teardown": { - "duration": 0.00031470798421651125, + "duration": 0.00046774977818131447, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", - "lineno": 204, + "lineno": 205, "outcome": "passed", "keywords": [ "test_chat_non_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", @@ -1419,21 +1629,21 @@ "case_id": "case0" }, "setup": { - "duration": 0.01369225000962615, + "duration": 0.015832332894206047, "outcome": "passed" }, "call": { - "duration": 0.34134254103992134, + "duration": 0.8238586660008878, "outcome": "passed" }, "teardown": { - "duration": 0.0002922919811680913, + "duration": 0.0006185418460518122, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", - "lineno": 228, + "lineno": 229, "outcome": "passed", "keywords": [ "test_chat_streaming_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", @@ -1452,21 +1662,21 @@ "case_id": 
"case0" }, "setup": { - "duration": 0.025748749962076545, + "duration": 0.007832166040316224, "outcome": "passed" }, "call": { - "duration": 0.7462511250050738, + "duration": 0.685583250131458, "outcome": "passed" }, "teardown": { - "duration": 0.00030449999030679464, + "duration": 0.0004414590075612068, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", - "lineno": 228, + "lineno": 229, "outcome": "failed", "keywords": [ "test_chat_streaming_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", @@ -1485,34 +1695,39 @@ "case_id": "case0" }, "setup": { - "duration": 0.015131957945413888, + "duration": 0.021764083998277783, "outcome": "passed" }, "call": { - "duration": 0.4556894999695942, + "duration": 0.35617320891469717, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 251, + "lineno": 587, "message": "IndexError: list index out of range" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 251, + "lineno": 247, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, "message": "IndexError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n # Accumulate partial tool_calls here\n tool_calls_buffer = {}\n current_id = None\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:251: IndexError" + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n 
@pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n> _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:247: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" }, "teardown": { - "duration": 0.000539042055606842, + "duration": 0.0005425831768661737, "outcome": "passed" } }, { "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", - "lineno": 228, + "lineno": 229, "outcome": "failed", "keywords": [ "test_chat_streaming_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", @@ -1531,31 +1746,1833 @@ "case_id": "case0" }, "setup": { - "duration": 0.016429082956165075, + "duration": 0.016708041075617075, "outcome": "passed" }, "call": { - "duration": 0.3677835420239717, + "duration": 0.49443637509830296, "outcome": "failed", "crash": { "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", - "lineno": 251, + "lineno": 587, "message": "IndexError: list index out of range" }, "traceback": [ { "path": "tests/verifications/openai_api/test_chat_completion.py", - "lineno": 251, + "lineno": 247, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, "message": "IndexError" } ], - "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...el_display_names': {'gpt-4o': 'gpt-4o', 'gpt-4o-mini': 'gpt-4o-mini'}, 'models': ['gpt-4o', 'gpt-4o-mini'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on 
provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n # Accumulate partial tool_calls here\n tool_calls_buffer = {}\n current_id = None\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:251: IndexError" + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"],\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_calling(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n stream=True,\n )\n \n> _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:247: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" }, "teardown": { - "duration": 0.001610000035725534, + "duration": 0.0002642078325152397, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", + "lineno": 257, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "case0" + }, + "setup": { + "duration": 0.009570583933964372, + "outcome": "passed" + }, + "call": { + "duration": 0.5232214999850839, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0006591668352484703, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", + "lineno": 257, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "case0" + }, + "setup": { + "duration": 0.01567283389158547, + "outcome": "passed" + }, + "call": { + "duration": 0.4465816249139607, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0003922500181943178, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", + "lineno": 257, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "case0" + }, + "setup": { + "duration": 0.021711332956328988, + "outcome": "passed" + }, + "call": { + "duration": 0.5361095829866827, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0003099590539932251, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", + "lineno": 281, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_tool_choice_required[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "case0" + }, + "setup": { + "duration": 0.009334125090390444, + "outcome": "passed" + }, + "call": { + "duration": 0.5789772500284016, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00037712487392127514, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", + "lineno": 281, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "case0" + }, + "setup": { + "duration": 0.019614499993622303, + "outcome": "passed" + }, + "call": { + "duration": 0.444399792002514, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": 
"tests/verifications/openai_api/test_chat_completion.py", + "lineno": 300, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"required\", # Force tool call\n stream=True,\n )\n \n> _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:300: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" + }, + "teardown": { + "duration": 0.0004192921333014965, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", + "lineno": 281, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_tool_choice_required[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "case0" + }, + "setup": { + "duration": 0.012822834076359868, + "outcome": "passed" + }, + "call": { + "duration": 0.6777042911853641, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 300, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": 
"IndexError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_required(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"required\", # Force tool call\n stream=True,\n )\n \n> _, tool_calls_buffer = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:300: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" + }, + "teardown": { + "duration": 0.0004483328666538, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", + "lineno": 308, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "case0" + }, + "setup": { + "duration": 0.011924332939088345, + "outcome": "passed" + }, + "call": { + "duration": 0.4756374170538038, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 328, + "message": "AssertionError: Expected no tool calls when tool_choice='none'\nassert [ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)] is None\n + where [ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, 
role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]).tool_calls\n + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]), seed=13421903014786785000).message" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 328, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert response.choices[0].message.tool_calls is None, \"Expected no tool calls when tool_choice='none'\"\nE AssertionError: Expected no tool calls when tool_choice='none'\nE assert [ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)] is None\nE + where [ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]).tool_calls\nE + where ChatCompletionMessage(content=None, refusal=None, 
role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_nfd8oz9wmhlwf4mqglcaokrs', function=Function(arguments='{\"location\":\"San Francisco, USA\"}', name='get_weather'), type='function', index=0)]), seed=13421903014786785000).message\n\ntests/verifications/openai_api/test_chat_completion.py:328: AssertionError" + }, + "teardown": { + "duration": 0.0004585420247167349, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", + "lineno": 308, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "case0" + }, + "setup": { + "duration": 0.013246082933619618, + "outcome": "passed" + }, + "call": { + "duration": 0.5618870409671217, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 328, + "message": "AssertionError: Expected no tool calls when tool_choice='none'\nassert [ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)] is None\n + where [ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)]).tool_calls\n + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)]), seed=None).message" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 328, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 
'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert response.choices[0].message.tool_calls is None, \"Expected no tool calls when tool_choice='none'\"\nE AssertionError: Expected no tool calls when tool_choice='none'\nE assert [ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)] is None\nE + where [ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)]).tool_calls\nE + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_h5u55eksczab7xtg8oot43k5', function=Function(arguments='{\"location\":\"San Francisco, United States\"}', name='get_weather'), type='function', index=0)]), seed=None).message\n\ntests/verifications/openai_api/test_chat_completion.py:328: AssertionError" + }, + "teardown": { + "duration": 0.00025883293710649014, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", + "lineno": 308, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", + 
"parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "case0" + }, + "setup": { + "duration": 0.008055417099967599, + "outcome": "passed" + }, + "call": { + "duration": 0.32869591703638434, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 328, + "message": "AssertionError: Expected no tool calls when tool_choice='none'\nassert [ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] is None\n + where [ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]).tool_calls\n + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]), seed=None).message" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 328, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_non_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n response = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n 
tool_choice=\"none\",\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert response.choices[0].message.tool_calls is None, \"Expected no tool calls when tool_choice='none'\"\nE AssertionError: Expected no tool calls when tool_choice='none'\nE assert [ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] is None\nE + where [ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]).tool_calls\nE + where ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]) = Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_s1bz9v57b8uizqy2i81869pb', function=Function(arguments='{\"location\":\"San Francisco\"}', name='get_weather'), type='function', index=0)]), seed=None).message\n\ntests/verifications/openai_api/test_chat_completion.py:328: AssertionError" + }, + "teardown": { + "duration": 0.0003937501460313797, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", + "lineno": 331, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_tool_choice_none[meta-llama/Llama-3.3-70B-Instruct-Turbo-case0]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "case0" + }, + "setup": { + "duration": 0.013460749993100762, + "outcome": "passed" + }, + "call": { + "duration": 0.35879983310587704, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 355, + "message": "AssertionError: Expected no tool call chunks when tool_choice='none'\nassert not [ChoiceDeltaToolCall(index=0, id='call_q472clmnii99ps1fxqtv8qvr', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\n + where [ChoiceDeltaToolCall(index=0, id='call_q472clmnii99ps1fxqtv8qvr', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_q472clmnii99ps1fxqtv8qvr', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 355, + "message": "AssertionError" + 
} + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=True,\n )\n \n content = \"\"\n for chunk in stream:\n delta = chunk.choices[0].delta\n if delta.content:\n content += delta.content\n> assert not delta.tool_calls, \"Expected no tool call chunks when tool_choice='none'\"\nE AssertionError: Expected no tool call chunks when tool_choice='none'\nE assert not [ChoiceDeltaToolCall(index=0, id='call_q472clmnii99ps1fxqtv8qvr', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\nE + where [ChoiceDeltaToolCall(index=0, id='call_q472clmnii99ps1fxqtv8qvr', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_q472clmnii99ps1fxqtv8qvr', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:355: AssertionError" + }, + "teardown": { + "duration": 0.0002649170346558094, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", + "lineno": 331, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Scout-17B-16E-Instruct-case0]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "case0" + }, + "setup": { + "duration": 0.0068365419283509254, + "outcome": "passed" + }, + "call": { + "duration": 0.5351063329726458, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 355, + "message": "AssertionError: Expected no tool call chunks when tool_choice='none'\nassert not [ChoiceDeltaToolCall(index=0, id='call_l3roc57o2pn9b70f0dcgil53', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\n + where 
[ChoiceDeltaToolCall(index=0, id='call_l3roc57o2pn9b70f0dcgil53', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_l3roc57o2pn9b70f0dcgil53', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 355, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=True,\n )\n \n content = \"\"\n for chunk in stream:\n delta = chunk.choices[0].delta\n if delta.content:\n content += delta.content\n> assert not delta.tool_calls, \"Expected no tool call chunks when tool_choice='none'\"\nE AssertionError: Expected no tool call chunks when tool_choice='none'\nE assert not [ChoiceDeltaToolCall(index=0, id='call_l3roc57o2pn9b70f0dcgil53', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\nE + where [ChoiceDeltaToolCall(index=0, id='call_l3roc57o2pn9b70f0dcgil53', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_l3roc57o2pn9b70f0dcgil53', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:355: AssertionError" + }, + "teardown": { + "duration": 0.0004712918307632208, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", + "lineno": 331, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_tool_choice_none[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-case0", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + 
"case_id": "case0" + }, + "setup": { + "duration": 0.014073874801397324, + "outcome": "passed" + }, + "call": { + "duration": 0.6729549579322338, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 355, + "message": "AssertionError: Expected no tool call chunks when tool_choice='none'\nassert not [ChoiceDeltaToolCall(index=0, id='call_ktw831i0p838mzvnnaylf6fp', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\n + where [ChoiceDeltaToolCall(index=0, id='call_ktw831i0p838mzvnnaylf6fp', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_ktw831i0p838mzvnnaylf6fp', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 355, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"case\"], # Reusing existing case for now\n ids=case_id_generator,\n )\n def test_chat_streaming_tool_choice_none(request, openai_client, model, provider, verification_config, case):\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n stream = openai_client.chat.completions.create(\n model=model,\n messages=case[\"input\"][\"messages\"],\n tools=case[\"input\"][\"tools\"],\n tool_choice=\"none\",\n stream=True,\n )\n \n content = \"\"\n for chunk in stream:\n delta = chunk.choices[0].delta\n if delta.content:\n content += delta.content\n> assert not delta.tool_calls, \"Expected no tool call chunks when tool_choice='none'\"\nE AssertionError: Expected no tool call chunks when tool_choice='none'\nE assert not [ChoiceDeltaToolCall(index=0, id='call_ktw831i0p838mzvnnaylf6fp', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]\nE + where [ChoiceDeltaToolCall(index=0, id='call_ktw831i0p838mzvnnaylf6fp', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')] = ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=[ChoiceDeltaToolCall(index=0, id='call_ktw831i0p838mzvnnaylf6fp', function=ChoiceDeltaToolCallFunction(arguments='', name='get_weather'), type='function')]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:355: AssertionError" + }, + "teardown": { + "duration": 0.000251916004344821, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.009340125136077404, + "outcome": "passed" + }, + "call": { + "duration": 0.3328715830575675, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 0 tool calls, but got 1\nassert 1 == 0\n + where 1 = len(([ChatCompletionMessageToolCall(id='call_3rr948zuvun0533y4oyyep0z', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)]))\n + where [ChatCompletionMessageToolCall(id='call_3rr948zuvun0533y4oyyep0z', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_3rr948zuvun0533y4oyyep0z', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)]).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", 
[]))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 0 tool calls, but got 1\nE assert 1 == 0\nE + where 1 = len(([ChatCompletionMessageToolCall(id='call_3rr948zuvun0533y4oyyep0z', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)]))\nE + where [ChatCompletionMessageToolCall(id='call_3rr948zuvun0533y4oyyep0z', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_3rr948zuvun0533y4oyyep0z', function=Function(arguments='{\"location\":\"San Francisco, CA\"}', name='get_weather'), type='function', index=0)]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.00042020808905363083, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.01490145898424089, + "outcome": "passed" + }, + "call": { + "duration": 0.8346118750050664, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00034404080361127853, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + 
"test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.014493625145405531, + "outcome": "passed" + }, + "call": { + "duration": 0.8973606249783188, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00021345820277929306, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.009358166949823499, + "outcome": "passed" + }, + "call": { + "duration": 4.5295154170598835, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0002461671829223633, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.009552374947816133, + "outcome": "passed" + }, + "call": { + "duration": 0.34176899981684983, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 429, + "message": "AssertionError: Expected arguments '{'month': 1, 'year': 2025}', got '{'month': '1', 'year': '2025'}'\nassert {'month': '1', 'year': '2025'} == {'month': 1, 'year': 2025}\n \n Differing items:\n {'month': '1'} != {'month': 1}\n {'year': '2025'} != {'year': 2025}\n \n Full diff:\n {...\n \n ...Full output truncated (7 lines hidden), use '-vv' to show" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 429, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 
'tool_arguments': {'month': 1, 'year': ... 'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n tool_call = assistant_message.tool_calls[0]\n assert tool_call.function.name == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call.function.name}'\"\n )\n # Parse the JSON string arguments before comparing\n actual_arguments = json.loads(tool_call.function.arguments)\n> assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\nE AssertionError: Expected arguments '{'month': 1, 'year': 2025}', got '{'month': '1', 'year': '2025'}'\nE assert {'month': '1', 'year': '2025'} == {'month': 1, 'year': 2025}\nE \nE Differing items:\nE {'month': '1'} != {'month': 1}\nE {'year': '2025'} != {'year': 2025}\nE \nE Full diff:\nE {...\nE \nE ...Full output truncated (7 lines hidden), use '-vv' to show\n\ntests/verifications/openai_api/test_chat_completion.py:429: AssertionError" + }, + "teardown": { + "duration": 
0.000527665950357914, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.012501416960731149, + "outcome": "passed" + }, + "call": { + "duration": 1.585734374821186, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError: Expected 0 tool calls, but got 2\nassert 2 == 0\n + where 2 = len(([ChatCompletionMessageToolCall(id='call_4fm3kj059swz9no94n6fg54d', function=Function(arguments='{\"location\":\"Sun, NA\"}', name='get_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_lzc5lo7y2p7wjyquvmvvzt64', function=Function(arguments='{\"name\":\"Sun\"}', name='get_latin_name'), type='function', index=1)]))\n + where [ChatCompletionMessageToolCall(id='call_4fm3kj059swz9no94n6fg54d', function=Function(arguments='{\"location\":\"Sun, NA\"}', name='get_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_lzc5lo7y2p7wjyquvmvvzt64', function=Function(arguments='{\"name\":\"Sun\"}', name='get_latin_name'), type='function', index=1)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_4fm3kj059swz9no94n6fg54d', function=Function(arguments='{\"location\":\"Sun, NA\"}', name='get_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_lzc5lo7y2p7wjyquvmvvzt64', function=Function(arguments='{\"name\":\"Sun\"}', name='get_latin_name'), type='function', index=1)]).tool_calls" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 418, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n 
\"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n> assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\nE AssertionError: Expected 0 tool calls, but got 2\nE assert 2 == 0\nE + where 2 = len(([ChatCompletionMessageToolCall(id='call_4fm3kj059swz9no94n6fg54d', function=Function(arguments='{\"location\":\"Sun, NA\"}', name='get_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_lzc5lo7y2p7wjyquvmvvzt64', function=Function(arguments='{\"name\":\"Sun\"}', name='get_latin_name'), type='function', index=1)]))\nE + where [ChatCompletionMessageToolCall(id='call_4fm3kj059swz9no94n6fg54d', function=Function(arguments='{\"location\":\"Sun, NA\"}', name='get_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_lzc5lo7y2p7wjyquvmvvzt64', function=Function(arguments='{\"name\":\"Sun\"}', name='get_latin_name'), type='function', index=1)] = ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_4fm3kj059swz9no94n6fg54d', function=Function(arguments='{\"location\":\"Sun, NA\"}', name='get_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_lzc5lo7y2p7wjyquvmvvzt64', function=Function(arguments='{\"name\":\"Sun\"}', name='get_latin_name'), type='function', index=1)]).tool_calls\n\ntests/verifications/openai_api/test_chat_completion.py:418: AssertionError" + }, + "teardown": { + "duration": 0.0003941669128835201, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]", 
+ "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.014057958032935858, + "outcome": "passed" + }, + "call": { + "duration": 0.7121559998486191, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00048266700468957424, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.02072141715325415, + "outcome": "passed" + }, + "call": { + "duration": 1.0424797078594565, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004878339823335409, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.018570583080872893, + "outcome": "passed" + }, + "call": { + "duration": 3.4340267919469625, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00023016706109046936, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.009570334106683731, + "outcome": "passed" + }, + "call": { + "duration": 2.2068665840197355, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00051837507635355, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.01873366697691381, + "outcome": "passed" + }, + "call": { + "duration": 0.5193468749057502, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 446, + "message": "AssertionError: Expected one of ['sol'] in content, but got: '{\"name\": null, \"parameters\": null}'\nassert False\n + where False = any(. at 0x10e4c0f90>)" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 446, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n tool_call = assistant_message.tool_calls[0]\n assert tool_call.function.name == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call.function.name}'\"\n )\n # Parse the JSON string arguments before comparing\n actual_arguments = json.loads(tool_call.function.arguments)\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": tool_call.id,\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n assert assistant_message.content is not None, \"Expected content, but none received.\"\n expected_answers = expected[\"answer\"] # This is now a list\n content_lower = assistant_message.content.lower()\n> assert any(ans.lower() in content_lower for ans in expected_answers), (\n f\"Expected one of {expected_answers} in content, but got: '{assistant_message.content}'\"\n )\nE AssertionError: Expected one of ['sol'] in content, but got: '{\"name\": null, \"parameters\": null}'\nE assert False\nE + where False = any(. 
at 0x10e4c0f90>)\n\ntests/verifications/openai_api/test_chat_completion.py:446: AssertionError" + }, + "teardown": { + "duration": 0.0004933748859912157, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.014272749889642, + "outcome": "passed" + }, + "call": { + "duration": 1.911199334077537, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00043049990199506283, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.031040542060509324, + "outcome": "passed" + }, + "call": { + "duration": 3.0026419160421938, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00045104208402335644, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]", + "lineno": 359, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.016529500018805265, + "outcome": "passed" + }, + "call": { + "duration": 2.7563346249517053, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 429, + "message": "AssertionError: Expected arguments '{'name': 'Team Building', 'date': '2025-03-03', 'time': '10:00', 'location': 'Main Conference Room', 'participants': ['Alice', 'Bob', 'Charlie']}', got '{'participants': '[\"Alice\", \"Bob\", \"Charlie\"]', 'location': 'Main Conference Room', 'name': 'Team Building', 'date': '2025-03-03', 'time': '10:00'}'\nassert {'date': '202...arlie\"]', ...} == {'date': '202...harlie'], ...}\n \n Omitting 4 identical items, use -vv to show\n 
Differing items:\n {'participants': '[\"Alice\", \"Bob\", \"Charlie\"]'} != {'participants': ['Alice', 'Bob', 'Charlie']}\n \n Full diff:\n {...\n \n ...Full output truncated (11 lines hidden), use '-vv' to show" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 429, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_non_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\"\n Test cases for multi-turn tool calling.\n Tool calls are asserted.\n Tool responses are provided in the test case.\n Final response is asserted.\n \"\"\"\n \n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n # Create a copy of the messages list to avoid modifying the original\n messages = []\n tools = case[\"input\"][\"tools\"]\n # Use deepcopy to prevent modification across runs/parametrization\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n # keep going until either\n # 1. we have messages to test in multi-turn\n # 2. 
no messages but last message is tool response\n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n # do not take new messages if last message is tool response\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n # Ensure new_messages is a list of message objects\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n # If it's a single message object, add it directly\n messages.append(new_messages)\n \n # --- API Call ---\n response = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=False,\n )\n \n # --- Process Response ---\n assistant_message = response.choices[0].message\n messages.append(assistant_message.model_dump(exclude_unset=True))\n \n assert assistant_message.role == \"assistant\"\n \n # Get the expected result data\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n # --- Assertions based on expected result ---\n assert len(assistant_message.tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(assistant_message.tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n tool_call = assistant_message.tool_calls[0]\n assert tool_call.function.name == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call.function.name}'\"\n )\n # Parse the JSON string arguments before comparing\n actual_arguments = json.loads(tool_call.function.arguments)\n> assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\nE AssertionError: Expected arguments '{'name': 'Team Building', 'date': '2025-03-03', 'time': '10:00', 'location': 'Main Conference Room', 'participants': ['Alice', 'Bob', 'Charlie']}', got '{'participants': '[\"Alice\", \"Bob\", \"Charlie\"]', 'location': 'Main Conference Room', 'name': 'Team Building', 'date': '2025-03-03', 'time': '10:00'}'\nE assert {'date': '202...arlie\"]', ...} == {'date': '202...harlie'], ...}\nE \nE Omitting 4 identical items, use -vv to show\nE Differing items:\nE {'participants': '[\"Alice\", \"Bob\", \"Charlie\"]'} != {'participants': ['Alice', 'Bob', 'Charlie']}\nE \nE Full diff:\nE {...\nE \nE ...Full output truncated (11 lines hidden), use '-vv' to show\n\ntests/verifications/openai_api/test_chat_completion.py:429: AssertionError" + }, + "teardown": { + "duration": 0.0005542081780731678, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]", + "lineno": 359, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.013607957866042852, + "outcome": "passed" + }, + "call": { + "duration": 3.0105869588442147, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004793750122189522, + "outcome": "passed" + } + }, + { + 
"nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.01806124998256564, + "outcome": "passed" + }, + "call": { + "duration": 0.3295827910769731, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError: Expected 0 tool calls, but got 1\nassert 1 == 0\n + where 1 = len(([{'function': {'arguments': '{\"location\":\"San Francisco, CA\"}', 'name': 'get_weather'}, 'id': 'call_l066e8oey2i8exeodczlv1mh', 'type': 'function'}]))" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 500, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = 
accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n> assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\nE AssertionError: Expected 0 tool calls, but got 1\nE assert 1 == 0\nE + where 1 = len(([{'function': {'arguments': '{\"location\":\"San Francisco, CA\"}', 'name': 'get_weather'}, 'id': 'call_l066e8oey2i8exeodczlv1mh', 'type': 'function'}]))\n\ntests/verifications/openai_api/test_chat_completion.py:500: AssertionError" + }, + "teardown": { + "duration": 0.0002942080609500408, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.007637625094503164, + "outcome": "passed" + }, + "call": { + "duration": 2.021851292112842, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 526, + "message": "AssertionError: Expected content, but none received.\nassert ('' is not None and '' != '')" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 526, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and 
messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n # Use the first accumulated tool call for assertion\n tool_call = accumulated_tool_calls[0]\n assert tool_call[\"function\"][\"name\"] == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'\"\n )\n # Parse the accumulated arguments string for comparison\n actual_arguments = json.loads(tool_call[\"function\"][\"arguments\"])\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": tool_call[\"id\"],\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n> assert accumulated_content is not None and accumulated_content != \"\", \"Expected content, but none received.\"\nE AssertionError: Expected content, but none received.\nE assert ('' is not None and '' != '')\n\ntests/verifications/openai_api/test_chat_completion.py:526: AssertionError" + }, + "teardown": { + "duration": 0.00036791712045669556, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]", + "lineno": 450, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.013031583046540618, + "outcome": "passed" + }, + "call": { + "duration": 0.8596610419917852, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00042829103767871857, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + 
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.015244666952639818, + "outcome": "passed" + }, + "call": { + "duration": 1.0227877080906183, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 526, + "message": "AssertionError: Expected content, but none received.\nassert ('' is not None and '' != '')" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 526, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} 
tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n # Use the first accumulated tool call for assertion\n tool_call = accumulated_tool_calls[0]\n assert tool_call[\"function\"][\"name\"] == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'\"\n )\n # Parse the accumulated arguments string for comparison\n actual_arguments = json.loads(tool_call[\"function\"][\"arguments\"])\n assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\n \n # Prepare and append the tool response for the next turn\n tool_response = tool_responses.pop(0)\n messages.append(\n {\n \"role\": \"tool\",\n \"tool_call_id\": tool_call[\"id\"],\n \"content\": tool_response[\"response\"],\n }\n )\n else:\n> assert accumulated_content is not None and accumulated_content != \"\", \"Expected content, but none received.\"\nE AssertionError: Expected content, but none received.\nE assert ('' is not None and '' != '')\n\ntests/verifications/openai_api/test_chat_completion.py:526: AssertionError" + }, + "teardown": { + "duration": 0.00024933391250669956, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-3.3-70B-Instruct-Turbo-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.008626125054433942, + "outcome": "passed" + }, + "call": { + "duration": 0.3212552920449525, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 512, + "message": "AssertionError: Expected arguments '{'month': 1, 'year': 2025}', got '{'month': '1', 'year': '2025'}'\nassert {'month': '1', 'year': '2025'} == {'month': 1, 'year': 2025}\n \n Differing items:\n {'month': '1'} != {'month': 1}\n {'year': '2025'} != {'year': 2025}\n \n Full diff:\n {...\n \n ...Full output truncated (7 lines hidden), use '-vv' to show" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 512, + "message": "AssertionError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-3.3-70B-Instruct-Turbo', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 
'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n \n # --- Construct Assistant Message for History ---\n assistant_message_dict = {\"role\": \"assistant\"}\n if accumulated_content:\n assistant_message_dict[\"content\"] = accumulated_content\n if accumulated_tool_calls:\n assistant_message_dict[\"tool_calls\"] = accumulated_tool_calls\n \n messages.append(assistant_message_dict)\n \n # --- Assertions ---\n expected = expected_results.pop(0)\n num_tool_calls = expected[\"num_tool_calls\"]\n \n assert len(accumulated_tool_calls or []) == num_tool_calls, (\n f\"Expected {num_tool_calls} tool calls, but got {len(accumulated_tool_calls or [])}\"\n )\n \n if num_tool_calls > 0:\n # Use the first accumulated tool call for assertion\n tool_call = accumulated_tool_calls[0]\n assert tool_call[\"function\"][\"name\"] == expected[\"tool_name\"], (\n f\"Expected tool '{expected['tool_name']}', got '{tool_call['function']['name']}'\"\n )\n # Parse the accumulated arguments string for comparison\n actual_arguments = json.loads(tool_call[\"function\"][\"arguments\"])\n> assert actual_arguments == expected[\"tool_arguments\"], (\n f\"Expected arguments '{expected['tool_arguments']}', got '{actual_arguments}'\"\n )\nE AssertionError: Expected arguments '{'month': 1, 'year': 2025}', got '{'month': '1', 'year': '2025'}'\nE assert {'month': '1', 'year': '2025'} == {'month': 1, 'year': 2025}\nE \nE Differing items:\nE {'month': '1'} != {'month': 1}\nE {'year': '2025'} != {'year': 2025}\nE \nE Full diff:\nE {...\nE \nE ...Full output truncated (7 lines hidden), use '-vv' to show\n\ntests/verifications/openai_api/test_chat_completion.py:512: AssertionError" + }, + "teardown": { + "duration": 0.00020562508143484592, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + 
"test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.007338125025853515, + "outcome": "passed" + }, + "call": { + "duration": 0.4175920831039548, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 485, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: 
list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" + }, + "teardown": { + "duration": 0.00023462506942451, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.007788832997903228, + "outcome": "passed" + }, + "call": { + "duration": 0.45610866602510214, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 485, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" + }, + "teardown": { + "duration": 0.00021450011990964413, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.006751166889443994, + "outcome": "passed" + }, + "call": { + "duration": 0.7053082089405507, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 485, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'add_product_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'inStock': True, 'name': 'Widget...}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': 'Successfully added product with id: 123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n 
messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" + }, + "teardown": { + "duration": 0.00021783309057354927, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.008729791967198253, + "outcome": "passed" + }, + "call": { + "duration": 0.5665898330044001, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 485, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = 
case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" + }, + "teardown": { + "duration": 0.0002288338728249073, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Scout-17B-16E-Instruct-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.009526000125333667, + "outcome": "passed" + }, + "call": { + "duration": 1.1714977910742164, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 485, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Scout-17B-16E-Instruct', provider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 
'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" + }, + "teardown": { + "duration": 0.00032483390532433987, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-text_then_weather_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "text_then_weather_tool" + }, + "setup": { + "duration": 0.010107750073075294, + "outcome": "passed" + }, + "call": { + "duration": 0.26202141703106463, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 485, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel 
= 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'text_then_weather_tool', 'expected': [{'answer': ['sol'], 'num_tool_calls': 0}, {'num_tool_calls': 1, 'to...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" + }, + "teardown": { + "duration": 0.00022558285854756832, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-weather_tool_then_text", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "weather_tool_then_text" + }, + "setup": { + "duration": 0.008256082888692617, + "outcome": "passed" + }, + "call": { + "duration": 0.3466235001105815, + "outcome": "failed", + "crash": { + "path": 
"/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 485, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'weather_tool_then_text', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'location': 'San Francisco...], 'type': 'object'}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': '70 degrees and foggy'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" + }, + "teardown": { + "duration": 0.000535458093509078, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool]", + "parametrize", + "pytestmark", + 
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-add_product_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "add_product_tool" + }, + "setup": { + "duration": 0.0180504999589175, + "outcome": "passed" + }, + "call": { + "duration": 1.8803812500555068, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 485, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'add_product_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'inStock': True, 'name': 'Widget...}}, 'type': 'function'}]}, 'tool_responses': [{'response': \"{'response': 'Successfully added product with id: 123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" + }, + "teardown": { + "duration": 
0.00025062495842576027, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-get_then_create_event_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "get_then_create_event_tool" + }, + "setup": { + "duration": 0.00993091706186533, + "outcome": "passed" + }, + "call": { + "duration": 0.5258524999953806, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 485, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'get_then_create_event_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'date': '2025-03-03', ...ents found for 2025-03-03 at 10:00'}\"}, {'response': \"{'response': 'Successfully created new event with id: e_123'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
\n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" + }, + "teardown": { + "duration": 0.0002823749091476202, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai_api/test_chat_completion.py::test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]", + "lineno": 450, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_multi_turn_tool_calling[meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool]", + "parametrize", + "pytestmark", + "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8-compare_monthly_expense_tool", + "test_chat_completion.py", + "openai_api", + "verifications", + "tests", + "llama-stack", + "" + ], + "metadata": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "case_id": "compare_monthly_expense_tool" + }, + "setup": { + "duration": 0.047535917023196816, + "outcome": "passed" + }, + "call": { + "duration": 0.4426498331595212, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 485, + "message": "" + }, + { + "path": "tests/verifications/openai_api/test_chat_completion.py", + "lineno": 587, + "message": "IndexError" + } + ], + "longrepr": "request = >\nopenai_client = \nmodel = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\nprovider = 'together'\nverification_config = {'providers': {'cerebras': {'api_key_var': 'CEREBRAS_API_KEY', 'base_url': 'https://api.cerebras.ai/v1', 'model_displa...-versatile', 'meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct'], ...}, ...}}\ncase = {'case_id': 'compare_monthly_expense_tool', 'expected': [{'num_tool_calls': 1, 'tool_arguments': {'month': 1, 'year': ... 
'Total expenses for January 2025: $1000'}\"}, {'response': \"{'response': 'Total expenses for February 2024: $2000'}\"}]}\n\n @pytest.mark.parametrize(\n \"case\",\n chat_completion_test_cases.get(\"test_chat_multi_turn_tool_calling\", {}).get(\"test_params\", {}).get(\"case\", []),\n ids=case_id_generator,\n )\n def test_chat_streaming_multi_turn_tool_calling(request, openai_client, model, provider, verification_config, case):\n \"\"\" \"\"\"\n test_name_base = get_base_test_name(request)\n if should_skip_test(verification_config, provider, model, test_name_base):\n pytest.skip(f\"Skipping {test_name_base} for model {model} on provider {provider} based on config.\")\n \n messages = []\n tools = case[\"input\"][\"tools\"]\n expected_results = copy.deepcopy(case[\"expected\"])\n tool_responses = copy.deepcopy(case.get(\"tool_responses\", []))\n input_messages_turns = copy.deepcopy(case[\"input\"][\"messages\"])\n \n while len(input_messages_turns) > 0 or (len(messages) > 0 and messages[-1][\"role\"] == \"tool\"):\n if len(messages) == 0 or messages[-1][\"role\"] != \"tool\":\n new_messages = input_messages_turns.pop(0)\n if isinstance(new_messages, list):\n messages.extend(new_messages)\n else:\n messages.append(new_messages)\n \n # --- API Call (Streaming) ---\n stream = openai_client.chat.completions.create(\n model=model,\n messages=messages,\n tools=tools,\n stream=True,\n )\n \n # --- Process Stream ---\n> accumulated_content, accumulated_tool_calls = _accumulate_streaming_tool_calls(stream)\n\ntests/verifications/openai_api/test_chat_completion.py:485: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nstream = \n\n def _accumulate_streaming_tool_calls(stream):\n \"\"\"Accumulates tool calls and content from a streaming ChatCompletion response.\"\"\"\n tool_calls_buffer = {}\n current_id = None\n full_content = \"\" # Initialize content accumulator\n # Process streaming chunks\n for chunk in stream:\n> choice = chunk.choices[0]\nE IndexError: list index out of range\n\ntests/verifications/openai_api/test_chat_completion.py:587: IndexError" + }, + "teardown": { + "duration": 0.0010368749499320984, "outcome": "passed" } } ], - "run_timestamp": 1744328795 + "run_timestamp": 1744679294 }
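Note on the failures recorded above: every streaming multi-turn tool-calling case for this model crashes at the same point, where `_accumulate_streaming_tool_calls` indexes `chunk.choices[0]` on a chunk whose `choices` list is empty. The sketch below is illustrative only, not the repository's actual helper or fix: it assumes the standard OpenAI chat-completion chunk fields (`choices[].delta.content`, `choices[].delta.tool_calls`) and guards against empty-`choices` chunks, which some providers emit (for example a trailing usage-only chunk).

```python
# Illustrative, defensive variant of the accumulator seen in the tracebacks above.
# Assumptions: chunk objects follow the OpenAI streaming schema; the empty-`choices`
# guard is a plausible mitigation for the IndexError, not a confirmed root-cause fix.

def accumulate_streaming_tool_calls(stream):
    """Collect text content and tool calls from streamed chat-completion chunks."""
    tool_calls_buffer = {}  # tool-call index -> {"id", "name", "arguments"}
    full_content = ""

    for chunk in stream:
        # Skip chunks with no choices (e.g. a final usage-only chunk) instead of
        # indexing into an empty list, which is where the runs above raised IndexError.
        if not chunk.choices:
            continue

        delta = chunk.choices[0].delta
        if delta.content:
            full_content += delta.content

        for tc in delta.tool_calls or []:
            buf = tool_calls_buffer.setdefault(
                tc.index, {"id": None, "name": "", "arguments": ""}
            )
            if tc.id:
                buf["id"] = tc.id
            if tc.function and tc.function.name:
                buf["name"] += tc.function.name
            if tc.function and tc.function.arguments:
                buf["arguments"] += tc.function.arguments

    return full_content, [tool_calls_buffer[i] for i in sorted(tool_calls_buffer)]
```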