fix: Fix max_tool_calls for the OpenAI provider and add integration tests for the max_tool_calls feature (#4190)

# Problem

OpenAI gpt-4 returned an error when built-in and MCP tool calls were skipped
due to the max_tool_calls parameter. The following is from the server log:
```
RuntimeError: OpenAI response failed: Error code: 400 - {'error': {'message': "An assistant message with       
'tool_calls' must be followed by tool messages responding to each 'tool_call_id'. The following tool_call_ids  
did not have response messages: call_Yi9V1QNpN73dJCAgP2Arcjej", 'type': 'invalid_request_error', 'param':      
'messages', 'code': None}}
```
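
To make the failure mode concrete, here is a minimal sketch of the kind of message history the provider rejects. It assumes the standard Chat Completions message shapes; the tool name and arguments are illustrative, not the exact payload the server builds (only the `tool_call_id` comes from the log above):

```python
# Sketch of a Chat Completions history that triggers the 400 above: the
# assistant message announces a tool call, but because max_tool_calls caused
# the call to be skipped, no role="tool" message answers its tool_call_id.
messages = [
    {"role": "user", "content": "Search for today's top technology news."},
    {
        "role": "assistant",
        "content": None,
        "tool_calls": [
            {
                "id": "call_Yi9V1QNpN73dJCAgP2Arcjej",
                "type": "function",
                "function": {"name": "web_search", "arguments": '{"query": "technology news"}'},
            }
        ],
    },
    # Missing: {"role": "tool", "tool_call_id": "call_Yi9V1QNpN73dJCAgP2Arcjej", ...}
    # OpenAI requires every tool_call_id to be answered before the next request.
]
```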

# What does this PR do?

- Fixes the error returned by OpenAI/GPT when tool calls are skipped due to
max_tool_calls: we now return a tool message that explicitly states that
the call was skipped (see the sketch below).
- Adds integration tests as a follow-up to
PR [#4062](https://github.com/llamastack/llama-stack/pull/4062)
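
As a hedged sketch of the approach (the function and variable names here are illustrative, not the actual llama-stack internals): for every tool call skipped because max_tool_calls has been reached, append a tool message that answers its `tool_call_id`, so the next request to OpenAI remains valid.

```python
def append_skipped_tool_messages(messages: list[dict], skipped_tool_calls: list[dict]) -> None:
    """Answer each skipped tool call so the assistant's tool_calls stay paired."""
    for tool_call in skipped_tool_calls:
        messages.append(
            {
                "role": "tool",
                "tool_call_id": tool_call["id"],
                # Placeholder content; the exact wording in the server may differ.
                "content": f"Tool call '{tool_call['function']['name']}' was skipped: "
                "max_tool_calls limit reached.",
            }
        )
```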

Part 2 for issue
[#3563](https://github.com/llamastack/llama-stack/issues/3563)

## Test Plan

- Added integration tests
- Added new recordings

---------

Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>

@@ -600,3 +600,155 @@ def test_response_streaming_multi_turn_tool_execution(responses_client, text_model_id):

```python
    assert expected_output.lower() in final_response.output_text.lower(), (
        f"Expected '{expected_output}' to appear in response: {final_response.output_text}"
    )


def test_max_tool_calls_with_function_tools(responses_client, text_model_id):
    """Test handling of max_tool_calls with function tools in responses."""
    max_tool_calls = 1
    tools = [
        {
            "type": "function",
            "name": "get_weather",
            "description": "Get weather information for a specified location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name (e.g., 'New York', 'London')",
                    },
                },
            },
        },
        {
            "type": "function",
            "name": "get_time",
            "description": "Get current time for a specified location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name (e.g., 'New York', 'London')",
                    },
                },
            },
        },
    ]

    response = responses_client.responses.create(
        model=text_model_id,
        input="Can you tell me the weather in Paris and the current time?",
        tools=tools,
        stream=False,
        max_tool_calls=max_tool_calls,
    )

    # Verify we got two function calls and that max_tool_calls does not affect function tools
    assert len(response.output) == 2
    assert response.output[0].type == "function_call"
    assert response.output[0].name == "get_weather"
    assert response.output[0].status == "completed"
    assert response.output[1].type == "function_call"
    assert response.output[1].name == "get_time"
    assert response.output[1].status == "completed"

    # Verify we have a valid max_tool_calls field
    assert response.max_tool_calls == max_tool_calls


def test_max_tool_calls_invalid(responses_client, text_model_id):
    """Test handling of invalid max_tool_calls in responses."""
    input = "Search for today's top technology news."
    invalid_max_tool_calls = 0
    tools = [
        {"type": "web_search"},
    ]

    # Create a response with an invalid max_tool_calls value, i.e. 0
    # Handle ValueError from LLS and BadRequestError from OpenAI client
    with pytest.raises((ValueError, llama_stack_client.BadRequestError, openai.BadRequestError)) as excinfo:
        responses_client.responses.create(
            model=text_model_id,
            input=input,
            tools=tools,
            stream=False,
            max_tool_calls=invalid_max_tool_calls,
        )

    error_message = str(excinfo.value)
    assert f"Invalid max_tool_calls={invalid_max_tool_calls}; should be >= 1" in error_message, (
        f"Expected error message about invalid max_tool_calls, got: {error_message}"
    )


def test_max_tool_calls_with_mcp_tools(responses_client, text_model_id):
    """Test handling of max_tool_calls with mcp tools in responses."""
    with make_mcp_server(tools=dependency_tools()) as mcp_server_info:
        input = "Get the experiment ID for 'boiling_point' and get the user ID for 'charlie'"
        max_tool_calls = [1, 5]
        tools = [
            {"type": "mcp", "server_label": "localmcp", "server_url": mcp_server_info["server_url"]},
        ]

        # First create a response that triggers mcp tools without max_tool_calls
        response = responses_client.responses.create(
            model=text_model_id,
            input=input,
            tools=tools,
            stream=False,
        )

        # Verify we got two mcp tool calls followed by a message
        assert len(response.output) == 4
        mcp_list_tools = [output for output in response.output if output.type == "mcp_list_tools"]
        mcp_calls = [output for output in response.output if output.type == "mcp_call"]
        message_outputs = [output for output in response.output if output.type == "message"]
        assert len(mcp_list_tools) == 1
        assert len(mcp_calls) == 2, f"Expected two mcp calls, got {len(mcp_calls)}"
        assert len(message_outputs) == 1, f"Expected one message output, got {len(message_outputs)}"

        # Next create a response that triggers mcp tools with max_tool_calls set to 1
        response_2 = responses_client.responses.create(
            model=text_model_id,
            input=input,
            tools=tools,
            stream=False,
            max_tool_calls=max_tool_calls[0],
        )

        # Verify we got one mcp tool call followed by a message
        assert len(response_2.output) == 3
        mcp_list_tools = [output for output in response_2.output if output.type == "mcp_list_tools"]
        mcp_calls = [output for output in response_2.output if output.type == "mcp_call"]
        message_outputs = [output for output in response_2.output if output.type == "message"]
        assert len(mcp_list_tools) == 1
        assert len(mcp_calls) == 1, f"Expected one mcp call, got {len(mcp_calls)}"
        assert len(message_outputs) == 1, f"Expected one message output, got {len(message_outputs)}"

        # Verify we have a valid max_tool_calls field
        assert response_2.max_tool_calls == max_tool_calls[0]

        # Finally create a response that triggers mcp tools with max_tool_calls set to 5
        response_3 = responses_client.responses.create(
            model=text_model_id,
            input=input,
            tools=tools,
            stream=False,
            max_tool_calls=max_tool_calls[1],
        )

        # Verify we got two mcp tool calls followed by a message
        assert len(response_3.output) == 4
        mcp_list_tools = [output for output in response_3.output if output.type == "mcp_list_tools"]
        mcp_calls = [output for output in response_3.output if output.type == "mcp_call"]
        message_outputs = [output for output in response_3.output if output.type == "message"]
        assert len(mcp_list_tools) == 1
        assert len(mcp_calls) == 2, f"Expected two mcp calls, got {len(mcp_calls)}"
        assert len(message_outputs) == 1, f"Expected one message output, got {len(message_outputs)}"

        # Verify we have a valid max_tool_calls field
        assert response_3.max_tool_calls == max_tool_calls[1]
```