feat: Implement the 'max_tool_calls' parameter for the Responses API (#4062)

# Problem The Responses API uses the max_tool_calls parameter to limit the number of tool calls that can be generated in a response. Currently, the LLS implementation of the Responses API does not support this parameter. # What does this PR do? This pull request adds the max_tool_calls field to the response object definition and updates the inline provider. It also ensures that: - the total number of calls to built-in and MCP tools does not exceed max_tool_calls - an error is thrown if max_tool_calls < 1 (behavior seen with the OpenAI Responses API, but we can change this if needed) Closes #[3563](https://github.com/llamastack/llama-stack/issues/3563) ## Test Plan - Tested manually for change in model response w.r.t. the supplied max_tool_calls field. - Added integration tests to test an invalid max_tool_calls parameter. - Added integration tests to check the max_tool_calls parameter with built-in and function tools. - Added integration tests to check the max_tool_calls parameter in the returned response object. - Recorded OpenAI Responses API behavior using a sample script: https://github.com/s-akhtar-baig/llama-stack-examples/blob/main/responses/src/max_tool_calls.py Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
2025-12-03 09:53:45 +00:00 · 2025-11-10 16:21:27 -05:00 · 2025-11-10 16:21:27 -05:00 · 433438cfc0
commit 433438cfc0
parent 209a78b618
9 changed files with 240 additions and 2 deletions
--- a/tests/integration/agents/test_openai_responses.py
+++ b/tests/integration/agents/test_openai_responses.py
@ -516,3 +516,169 @@ def test_response_with_instructions(openai_client, client_with_models, text_mode

    # Verify instructions from previous response was not carried over to the next response
    assert response_with_instructions2.instructions == instructions2
+
+
+@pytest.mark.skip(reason="Tool calling is not reliable.")
+def test_max_tool_calls_with_function_tools(openai_client, client_with_models, text_model_id):
+    """Test handling of max_tool_calls with function tools in responses."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI responses are not supported when testing with library client yet.")
+
+    client = openai_client
+    max_tool_calls = 1
+
+    tools = [
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description": "Get weather information for a specified location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city name (e.g., 'New York', 'London')",
+                    },
+                },
+            },
+        },
+        {
+            "type": "function",
+            "name": "get_time",
+            "description": "Get current time for a specified location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city name (e.g., 'New York', 'London')",
+                    },
+                },
+            },
+        },
+    ]
+
+    # First create a response that triggers function tools
+    response = client.responses.create(
+        model=text_model_id,
+        input="Can you tell me the weather in Paris and the current time?",
+        tools=tools,
+        stream=False,
+        max_tool_calls=max_tool_calls,
+    )
+
+    # Verify we got two function calls and that the max_tool_calls do not affect function tools
+    assert len(response.output) == 2
+    assert response.output[0].type == "function_call"
+    assert response.output[0].name == "get_weather"
+    assert response.output[0].status == "completed"
+    assert response.output[1].type == "function_call"
+    assert response.output[1].name == "get_time"
+    assert response.output[0].status == "completed"
+
+    # Verify we have a valid max_tool_calls field
+    assert response.max_tool_calls == max_tool_calls
+
+
+def test_max_tool_calls_invalid(openai_client, client_with_models, text_model_id):
+    """Test handling of invalid max_tool_calls in responses."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI responses are not supported when testing with library client yet.")
+
+    client = openai_client
+
+    input = "Search for today's top technology news."
+    invalid_max_tool_calls = 0
+    tools = [
+        {"type": "web_search"},
+    ]
+
+    # Create a response with an invalid max_tool_calls value i.e. 0
+    # Handle ValueError from LLS and BadRequestError from OpenAI client
+    with pytest.raises((ValueError, BadRequestError)) as excinfo:
+        client.responses.create(
+            model=text_model_id,
+            input=input,
+            tools=tools,
+            stream=False,
+            max_tool_calls=invalid_max_tool_calls,
+        )
+
+    error_message = str(excinfo.value)
+    assert f"Invalid max_tool_calls={invalid_max_tool_calls}; should be >= 1" in error_message, (
+        f"Expected error message about invalid max_tool_calls, got: {error_message}"
+    )
+
+
+def test_max_tool_calls_with_builtin_tools(openai_client, client_with_models, text_model_id):
+    """Test handling of max_tool_calls with built-in tools in responses."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI responses are not supported when testing with library client yet.")
+
+    client = openai_client
+
+    input = "Search for today's top technology and a positive news story. You MUST make exactly two separate web search calls."
+    max_tool_calls = [1, 5]
+    tools = [
+        {"type": "web_search"},
+    ]
+
+    # First create a response that triggers web_search tools without max_tool_calls
+    response = client.responses.create(
+        model=text_model_id,
+        input=input,
+        tools=tools,
+        stream=False,
+    )
+
+    # Verify we got two web search calls followed by a message
+    assert len(response.output) == 3
+    assert response.output[0].type == "web_search_call"
+    assert response.output[0].status == "completed"
+    assert response.output[1].type == "web_search_call"
+    assert response.output[1].status == "completed"
+    assert response.output[2].type == "message"
+    assert response.output[2].status == "completed"
+    assert response.output[2].role == "assistant"
+
+    # Next create a response that triggers web_search tools with max_tool_calls set to 1
+    response_2 = client.responses.create(
+        model=text_model_id,
+        input=input,
+        tools=tools,
+        stream=False,
+        max_tool_calls=max_tool_calls[0],
+    )
+
+    # Verify we got one web search tool call followed by a message
+    assert len(response_2.output) == 2
+    assert response_2.output[0].type == "web_search_call"
+    assert response_2.output[0].status == "completed"
+    assert response_2.output[1].type == "message"
+    assert response_2.output[1].status == "completed"
+    assert response_2.output[1].role == "assistant"
+
+    # Verify we have a valid max_tool_calls field
+    assert response_2.max_tool_calls == max_tool_calls[0]
+
+    # Finally create a response that triggers web_search tools with max_tool_calls set to 5
+    response_3 = client.responses.create(
+        model=text_model_id,
+        input=input,
+        tools=tools,
+        stream=False,
+        max_tool_calls=max_tool_calls[1],
+    )
+
+    # Verify we got two web search calls followed by a message
+    assert len(response_3.output) == 3
+    assert response_3.output[0].type == "web_search_call"
+    assert response_3.output[0].status == "completed"
+    assert response_3.output[1].type == "web_search_call"
+    assert response_3.output[1].status == "completed"
+    assert response_3.output[2].type == "message"
+    assert response_3.output[2].status == "completed"
+    assert response_3.output[2].role == "assistant"
+
+    # Verify we have a valid max_tool_calls field
+    assert response_3.max_tool_calls == max_tool_calls[1]